Muennighoff committed on
Commit
b220bd1
1 Parent(s): b101f59
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. 8b7178b13b/evaluation/8b7178b13b_0_babi.json +22 -0
  2. 8b7178b13b/evaluation/8b7178b13b_1_babi.json +22 -0
  3. 8b7178b13b/evaluation/8b7178b13b_2_babi.json +22 -0
  4. 8b7178b13b/evaluation/8b7178b13b_3_babi.json +22 -0
  5. 8b7178b13b/evaluation/8b7178b13b_4_babi.json +22 -0
  6. 8b7178b13b/evaluation/8b7178b13b_5_babi.json +22 -0
  7. 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json +1 -0
  8. 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json +1 -0
  9. 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json +1 -0
  10. 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json +1 -0
  11. 8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_3.json +1 -0
  12. 8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_4.json +1 -0
  13. 8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_5.json +1 -0
  14. 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl +3 -0
  15. 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl +3 -0
  16. 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl +3 -0
  17. 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl +3 -0
  18. 8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl +3 -0
  19. 8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl +3 -0
  20. 8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl +3 -0
  21. 8b7178b13b/evaluation/generation/merged.csv +53 -0
  22. 8b7178b13b/evaluation/generation/merged.json +1 -0
  23. 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json +133 -0
  24. 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json +133 -0
  25. 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json +133 -0
  26. 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json +133 -0
  27. 8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_3.json +133 -0
  28. 8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_4.json +133 -0
  29. 8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_5.json +133 -0
  30. 8b7178b13b/evaluation/rankeval/8b7178b13b_0.csv +21 -0
  31. 8b7178b13b/evaluation/rankeval/8b7178b13b_0_lm-eval_global_step84877_2023-05-15-10-07-19_0shots_backup.json +0 -87
  32. 8b7178b13b/evaluation/rankeval/8b7178b13b_1.csv +21 -0
  33. 8b7178b13b/evaluation/rankeval/8b7178b13b_1_lm-eval_global_step84877_2023-05-15-10-07-19_1shots_backup.json +0 -87
  34. 8b7178b13b/evaluation/rankeval/8b7178b13b_2.csv +21 -0
  35. 8b7178b13b/evaluation/rankeval/8b7178b13b_2_lm-eval_global_step84877_2023-05-15-10-07-19_2shots_backup.json +0 -87
  36. 8b7178b13b/evaluation/rankeval/8b7178b13b_3.csv +21 -0
  37. 8b7178b13b/evaluation/rankeval/8b7178b13b_3_lm-eval_global_step84877_2023-05-15-10-07-25_3shots_backup.json +0 -87
  38. 8b7178b13b/evaluation/rankeval/8b7178b13b_4.csv +21 -0
  39. 8b7178b13b/evaluation/rankeval/8b7178b13b_4_lm-eval_global_step84877_2023-05-15-10-06-46_4shots_backup.json +0 -87
  40. 8b7178b13b/evaluation/rankeval/8b7178b13b_5.csv +21 -0
  41. 8b7178b13b/evaluation/rankeval/8b7178b13b_5_lm-eval_global_step84877_2023-05-15-10-06-46_5shots_backup.json +0 -87
  42. 8b7178b13b/transformers/merges.txt +0 -0
  43. 8b7178b13b/transformers/tokenizer.json +0 -0
  44. 8b7178b13b/transformers/vocab.json +3 -0
  45. 8b7178b178b/evaluation/8b7178b178b_0_babi.json +22 -0
  46. 8b7178b178b/evaluation/8b7178b178b_1_babi.json +22 -0
  47. 8b7178b178b/evaluation/8b7178b178b_2_babi.json +22 -0
  48. 8b7178b178b/evaluation/8b7178b178b_3_babi.json +22 -0
  49. 8b7178b178b/evaluation/8b7178b178b_4_babi.json +22 -0
  50. 8b7178b178b/evaluation/8b7178b178b_5_babi.json +22 -0
8b7178b13b/evaluation/8b7178b13b_0_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.0,
+ "em_stderr": 0.0
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers",
+ "num_fewshot": 0,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
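Each of the six *_babi.json files in this commit follows the same lm-eval result layout: a "results" block with the exact-match score, a "versions" block, and the run "config" (only "num_fewshot" changes from file to file). A minimal sketch, assuming a local checkout of this repository (the path below is illustrative), reads one score back:

import json

# Hypothetical local path; adjust to wherever the evaluation JSON was downloaded.
with open("8b7178b13b/evaluation/8b7178b13b_0_babi.json") as f:
    report = json.load(f)

em = report["results"]["babi"]["em"]
em_stderr = report["results"]["babi"]["em_stderr"]
shots = report["config"]["num_fewshot"]
print(f"babi {shots}-shot: em={em:.4f} +/- {em_stderr:.4f}")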
8b7178b13b/evaluation/8b7178b13b_1_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.0,
+ "em_stderr": 0.0
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers",
+ "num_fewshot": 1,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b13b/evaluation/8b7178b13b_2_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.0,
+ "em_stderr": 0.0
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers",
+ "num_fewshot": 2,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b13b/evaluation/8b7178b13b_3_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.0,
+ "em_stderr": 0.0
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers",
+ "num_fewshot": 3,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b13b/evaluation/8b7178b13b_4_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.0,
+ "em_stderr": 0.0
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers",
+ "num_fewshot": 4,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b13b/evaluation/8b7178b13b_5_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.0,
+ "em_stderr": 0.0
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers",
+ "num_fewshot": 5,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.19198287608229725, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021092410953077365}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.32589953688933776, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002753157025629824}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.22344837173188767, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018933971382747686}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.048137852198349124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010124293758406351}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.08482162860321114, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017481381213950958}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05600464749019431, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001035304930307184}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.13481580311964722, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014071194011638184}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.23733416551629913, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00219502068666546}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.15854970566001372, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012774189115518982}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.18117711700554173, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019909544088953742}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.30882558318305736, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026454321668364913}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.21107970658856343, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017866890110014018}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.013745331324814, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06592229302797037}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.16688833624223554, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024001395431129838}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.27404204769144364, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003375962083519381}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1883894874399809, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022518375274135087}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.04136653299690803, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010174651332227344}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07172155304298297, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017383368313820028}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04708204631655152, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010069752068567668}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.11810699062222797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001714836740267086}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.19959382723616914, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002605476332476844}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13391288059874437, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001564486351392238}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.15789252138792423, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022764600108588808}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.26023963410461465, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003240567155894993}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.17834159419543702, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002133544799751138}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.986263870634216, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0869488541072268}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.05571162128505982, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019790843122430768}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.09490155511879607, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003216109108055619}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06229352605061771, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002059973094043126}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.013746904154847127, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000658207908545434}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.02605352352304377, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013317962968417952}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.01596233535299914, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007332643966253175}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.04028867827357096, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014354449068175981}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.07013160574213247, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024380518058086703}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04502172063067562, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014806563009305842}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.0526446584524354, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018735818695187819}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.08977749314111799, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030541817569144}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05884434373920439, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019486127272992522}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.7653537900871676, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.049323102026813614}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.009989872401580602, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009952828000827124}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.015276156161555617, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001444671711873694}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.010262809104423124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000941433577191994}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0024167502513949756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00033798462070527304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.004213844763908219, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006183240014712593}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0025848086028049965, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003301678198930521}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.007350732687047713, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000736612195363616}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.011512337859684543, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0011311946838375108}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.007530769725575652, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006918159097560924}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.009401823549618816, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009477935776177291}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.014304994804442096, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0013529102153531946}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.009601062566883777, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008823539930029628}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.313122283850011e-06, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 4.438365983452618e-06}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.14077378985561326, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002167627071465062}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3365881713882994, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004904093832075129}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19482488018157684, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028449440307477085}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03346907677060051, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00115993469618386}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08385926778368435, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0030304591639245464}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04702347182983231, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001623530160402657}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10579758272401656, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016080916023179224}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.25509688514423123, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003826588022233898}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.14665273300314152, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00212580713201847}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11263878026618175, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00176128619109869}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2715816804884895, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004199504898315826}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.15621401458929957, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00234420229722605}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.03791695157546, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1342348737065576}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.04332722979741195, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025397484135024503}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.08638484320409213, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004855712915740289}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05341124486563334, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029118807193791816}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.009371982127377696, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007761604812612375}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.02197790550576286, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018737167719494205}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0126600494371876, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010393927624132967}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.03308111037957417, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002037483672167313}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.06547067873377617, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037402890017668913}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04031062592059575, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022065055222514843}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.03547102946933422, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002145817702406172}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.07066149952031933, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004048133700252855}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04344922023460101, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002384615926685988}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.9907623400783704, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12144441541342028}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002432491256372859, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006531802890059347}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0035688165030904754, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.000969634171191206}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0026267053439145405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006851925438686376}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0002462802129838755, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00010248150275091556}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.00045873937383371344, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0002027676865696476}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0003000575858304317, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00012455541465929192}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.001824459823195743, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000493813415204384}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.002706254470313614, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0007319861753860738}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.001963800127619678, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005077215919641312}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.001947339725356528, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005292516418451348}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0028418243363945105, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0007704086076680828}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.002094899399917077, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005500941719652507}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.959625061740418e-19, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.1984703371139495e-15}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e1dbeb121b2aa6b21362cc27760dfac9df5f0c4392de6dc66b55afa2d2ddcdb
+ size 18917037
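The examples.*.jsonl files are tracked with Git LFS, so this diff only shows the three-line pointer (spec version, sha256 oid, byte size) rather than the generated examples themselves. A minimal sketch for reading those pointer fields, using the pointer above as input:

def parse_lfs_pointer(text: str) -> dict:
    """Split a Git LFS pointer file into its key/value fields."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:4e1dbeb121b2aa6b21362cc27760dfac9df5f0c4392de6dc66b55afa2d2ddcdb
size 18917037"""

info = parse_lfs_pointer(pointer)
print(info["oid"], int(info["size"]))  # sha256:4e1dbeb1... 18917037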
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9170e4bf6e70cb0474e5cc61a22d09edf230e43fbfc24971c3ca80e1f9303ace
+ size 24337079
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7121ccb97fa1421eb5d54fd4c30f575c47407e8e1d22029847bd1bc796605b98
+ size 29478626
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d4f84f2bf12c2c7f4343c7a75d12d4f3b26590608f8e73f6feb6f4760f1adf8
+ size 34801616
8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fbbd5b38d4479e236de19fb18b0426fc754c283c908c6551a1c23132a8bec707
+ size 9647191
8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:efb257030811a985c3b288620cf2719b5f2f7b4cf73388bcdb5057bd80ab93d9
+ size 11673906
8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc87da448769f33c6956c2aade6eb6ad69e8c8db403d47f83be5244751597f78
+ size 13899064
8b7178b13b/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
+ dataset,fewshots,prompt,metric,value
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.013098749477153882
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.013098749477153882
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.17675735102863532
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.17675735102863532
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.20791813280930055
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.20791813280930055
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2138339925008178
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2138339925008178
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.21707781052843164
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.21707781052843164
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.21807002700335165
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.21807002700335165
+ e2e_nlg_cleaned,5,average,multiple,0.17445934389128182
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.05279448511938101
+ gem_xsum,0,median,rouge2_fmeasure,0.05279448511938101
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03829439148207006
+ gem_xsum,1,median,rouge2_fmeasure,0.03829439148207006
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04760922765448965
+ gem_xsum,2,median,rouge2_fmeasure,0.04760922765448965
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04702347182983231
+ gem_xsum,3,median,rouge2_fmeasure,0.04702347182983231
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.0126600494371876
+ gem_xsum,4,median,rouge2_fmeasure,0.0126600494371876
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0003000575858304317
+ gem_xsum,5,median,rouge2_fmeasure,0.0003000575858304317
+ gem_xsum,5,average,multiple,0.033113613851465176
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.051001464150726095
+ web_nlg_en,0,median,rouge2_fmeasure,0.051001464150726095
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05455324665630359
+ web_nlg_en,1,median,rouge2_fmeasure,0.05455324665630359
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.055583861631680276
+ web_nlg_en,2,median,rouge2_fmeasure,0.055583861631680276
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05769919253564089
+ web_nlg_en,3,median,rouge2_fmeasure,0.05769919253564089
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.06050302495157815
+ web_nlg_en,4,median,rouge2_fmeasure,0.06050302495157815
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.06252454636994373
+ web_nlg_en,5,median,rouge2_fmeasure,0.06252454636994373
+ web_nlg_en,5,average,multiple,0.05697755604931212
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04191366659253858
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.04191366659253858
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04827166271383884
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.04827166271383884
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05600464749019431
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.05600464749019431
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04708204631655152
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04708204631655152
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01596233535299914
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.01596233535299914
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0025848086028049965
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0025848086028049965
+ wiki_lingua_en,5,average,multiple,0.0353031945114879
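merged.csv flattens the per-prompt scores into dataset,fewshots,prompt,metric,value rows. Because each task here uses a single prompt, the "median" rows simply repeat that prompt's rouge2_fmeasure, and the "average" row is the mean of the six per-shot medians. A short sketch, assuming pandas and a local copy of the file, reproduces the e2e_nlg_cleaned average:

import pandas as pd

# Hypothetical local path to the merged.csv shown above.
df = pd.read_csv("8b7178b13b/evaluation/generation/merged.csv")

medians = df[(df["dataset"] == "e2e_nlg_cleaned") & (df["prompt"] == "median")]
print(medians["value"].mean())  # ~0.17445934389128182, matching the 'average' row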
8b7178b13b/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4373157998206226, "bleu_stderr": 0.056741486001354216, "rouge1_fmeasure": 0.10947380559420297, "rouge1_fmeasure_stderr": 0.0022511546576554292, "rouge1_precision": 0.07630675677312203, "rouge1_precision_stderr": 0.0021365872076980082, "rouge1_recall": 0.2947830429134024, "rouge1_recall_stderr": 0.0047665492114514684, "rouge2_fmeasure": 0.051001464150726095, "rouge2_fmeasure_stderr": 0.0014146940985046523, "rouge2_precision": 0.03437396040114199, "rouge2_precision_stderr": 0.0011646295650624351, "rouge2_recall": 0.13907462394894776, "rouge2_recall_stderr": 0.0031715068123442996, "rougeL_fmeasure": 0.10417993615217146, "rougeL_fmeasure_stderr": 0.002043414019393663, "rougeL_precision": 0.07227253003813615, "rougeL_precision_stderr": 0.001955466598501427, "rougeL_recall": 0.28336049631237586, "rougeL_recall_stderr": 0.004569573466829386, "rougeLsum_fmeasure": 0.10462419732455051, "rougeLsum_fmeasure_stderr": 0.002115527416565905, "rougeLsum_precision": 0.07290032072974349, "rougeLsum_precision_stderr": 0.002034641349821296, "rougeLsum_recall": 0.2825727583996548, "rougeLsum_recall_stderr": 0.004541613699421234}}, "1": {"PALM_prompt": {"bleu": 0.5816764130686213, "bleu_stderr": 0.054147923323820346, "rouge1_fmeasure": 0.11727270364668571, "rouge1_fmeasure_stderr": 0.0021476260743326565, "rouge1_precision": 0.07783086970109386, "rouge1_precision_stderr": 0.001926390801136041, "rouge1_recall": 0.36711372145643695, "rouge1_recall_stderr": 0.005207087210896133, "rouge2_fmeasure": 0.05455324665630359, "rouge2_fmeasure_stderr": 0.0013665856855669073, "rouge2_precision": 0.035941971570071736, "rouge2_precision_stderr": 0.0011116447277521812, "rouge2_recall": 0.17604536516651662, "rouge2_recall_stderr": 0.0036353372642678756, "rougeL_fmeasure": 0.10871994085882164, "rougeL_fmeasure_stderr": 0.0019153699814912425, "rougeL_precision": 0.07209000868611247, "rougeL_precision_stderr": 0.0017559014471748775, "rougeL_recall": 0.33901675935925657, "rougeL_recall_stderr": 0.004687996190515175, "rougeLsum_fmeasure": 0.11053867463797286, "rougeLsum_fmeasure_stderr": 0.0020173857832705686, "rougeLsum_precision": 0.0735230628158407, "rougeLsum_precision_stderr": 0.0018435905984185187, "rougeLsum_recall": 0.34433610068323095, "rougeLsum_recall_stderr": 0.004792623835429767}}, "2": {"PALM_prompt": {"bleu": 0.6801735381749798, "bleu_stderr": 0.04514863078336761, "rouge1_fmeasure": 0.11972585427910692, "rouge1_fmeasure_stderr": 0.001963983917319172, "rouge1_precision": 0.07666591908388082, "rouge1_precision_stderr": 0.0015425382902699712, "rouge1_recall": 0.40028519820196484, "rouge1_recall_stderr": 0.0053218199498182555, "rouge2_fmeasure": 0.055583861631680276, "rouge2_fmeasure_stderr": 0.0012687258801338563, "rouge2_precision": 0.03562073396147935, "rouge2_precision_stderr": 0.0009954366405729978, "rouge2_recall": 0.19724892000097954, "rouge2_recall_stderr": 0.004019172908229666, "rougeL_fmeasure": 0.10999138993564177, "rougeL_fmeasure_stderr": 0.0017648420950003858, "rougeL_precision": 0.07037367893965144, "rougeL_precision_stderr": 0.0013558489579657599, "rougeL_recall": 0.3648716666842121, "rougeL_recall_stderr": 0.004723267009132289, "rougeLsum_fmeasure": 0.1122905686653009, "rougeLsum_fmeasure_stderr": 0.0018425170251459826, "rougeLsum_precision": 0.07198504559682002, "rougeLsum_precision_stderr": 0.0014532243063151847, "rougeLsum_recall": 0.3737640227903558, "rougeLsum_recall_stderr": 0.004892612299725926}}, "3": {"PALM_prompt": {"bleu": 0.8393015493701177, 
"bleu_stderr": 0.049868510474836, "rouge1_fmeasure": 0.12332290364001403, "rouge1_fmeasure_stderr": 0.0020213255478620834, "rouge1_precision": 0.07934496701755703, "rouge1_precision_stderr": 0.0017108996646152186, "rouge1_recall": 0.41957311539245606, "rouge1_recall_stderr": 0.005383968460349908, "rouge2_fmeasure": 0.05769919253564089, "rouge2_fmeasure_stderr": 0.001315453385968732, "rouge2_precision": 0.03700401446806371, "rouge2_precision_stderr": 0.0010754080423489075, "rouge2_recall": 0.20785188427731804, "rouge2_recall_stderr": 0.0039850136524860235, "rougeL_fmeasure": 0.11171273283361995, "rougeL_fmeasure_stderr": 0.0017620828691461753, "rougeL_precision": 0.07186075331384528, "rougeL_precision_stderr": 0.0015065682717614625, "rougeL_recall": 0.3792930717112658, "rougeL_recall_stderr": 0.004756348746075166, "rougeLsum_fmeasure": 0.11543602475589927, "rougeLsum_fmeasure_stderr": 0.001888123216509075, "rougeLsum_precision": 0.07439776049493438, "rougeLsum_precision_stderr": 0.0016078356678010679, "rougeLsum_recall": 0.39147213524665503, "rougeLsum_recall_stderr": 0.004968164479358761}}, "4": {"PALM_prompt": {"bleu": 0.8436391155168642, "bleu_stderr": 0.049789928911136136, "rouge1_fmeasure": 0.12951808355992003, "rouge1_fmeasure_stderr": 0.0019403013558157668, "rouge1_precision": 0.08188000384427346, "rouge1_precision_stderr": 0.0015429926759057174, "rouge1_recall": 0.45225212783083296, "rouge1_recall_stderr": 0.005452056426418837, "rouge2_fmeasure": 0.06050302495157815, "rouge2_fmeasure_stderr": 0.0012369724150331927, "rouge2_precision": 0.03789712819393572, "rouge2_precision_stderr": 0.0009152136077591718, "rouge2_recall": 0.22689495723762804, "rouge2_recall_stderr": 0.004067188852506108, "rougeL_fmeasure": 0.11592286526905457, "rougeL_fmeasure_stderr": 0.0016906240203803347, "rougeL_precision": 0.07330729359090057, "rougeL_precision_stderr": 0.0013644419423480823, "rougeL_recall": 0.4045943171397325, "rougeL_recall_stderr": 0.0048320778663538955, "rougeLsum_fmeasure": 0.12060522562383363, "rougeLsum_fmeasure_stderr": 0.0018022149452879743, "rougeLsum_precision": 0.07637812548771711, "rougeLsum_precision_stderr": 0.0014540942217052375, "rougeLsum_recall": 0.4207633762253156, "rougeLsum_recall_stderr": 0.005005614020197891}}, "5": {"PALM_prompt": {"bleu": 0.968801645723095, "bleu_stderr": 0.05462868115593731, "rouge1_fmeasure": 0.13468892691608006, "rouge1_fmeasure_stderr": 0.0019240731337134673, "rouge1_precision": 0.08408541082259362, "rouge1_precision_stderr": 0.001415159971957011, "rouge1_recall": 0.4730441639084742, "rouge1_recall_stderr": 0.005431978662817184, "rouge2_fmeasure": 0.06252454636994373, "rouge2_fmeasure_stderr": 0.0012377179103821956, "rouge2_precision": 0.038803485654806714, "rouge2_precision_stderr": 0.0008788058278028121, "rouge2_recall": 0.2379631167113304, "rouge2_recall_stderr": 0.004152278672512853, "rougeL_fmeasure": 0.11896213494013362, "rougeL_fmeasure_stderr": 0.001650634135835398, "rougeL_precision": 0.07423644311431606, "rougeL_precision_stderr": 0.0012142160071926726, "rougeL_recall": 0.4194644730807224, "rougeL_recall_stderr": 0.0047799777603664715, "rougeLsum_fmeasure": 0.12479160570822044, "rougeLsum_fmeasure_stderr": 0.0017844920178026805, "rougeLsum_precision": 0.07800322392434518, "rougeLsum_precision_stderr": 0.0013202039315876929, "rougeLsum_recall": 0.43851085762444947, "rougeLsum_recall_stderr": 0.004968418620592047}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.8310798418084737, "bleu_stderr": 0.06119305662626742, "rouge1_fmeasure": 
0.19188833167040015, "rouge1_fmeasure_stderr": 0.0018702943944022418, "rouge1_precision": 0.1628352267958049, "rouge1_precision_stderr": 0.0019076289743005436, "rouge1_recall": 0.28137034403026634, "rouge1_recall_stderr": 0.002726139405286867, "rouge2_fmeasure": 0.04191366659253858, "rouge2_fmeasure_stderr": 0.000917658619128283, "rouge2_precision": 0.035168122195033014, "rouge2_precision_stderr": 0.0008051014260490183, "rouge2_recall": 0.06371619685684557, "rouge2_recall_stderr": 0.0015314676134498058, "rougeL_fmeasure": 0.14614932555210797, "rougeL_fmeasure_stderr": 0.0012956973040530264, "rougeL_precision": 0.12246239102421737, "rougeL_precision_stderr": 0.001285311648356421, "rougeL_recall": 0.2201424968967575, "rougeL_recall_stderr": 0.0022006617198564397, "rougeLsum_fmeasure": 0.17662543049894286, "rougeLsum_fmeasure_stderr": 0.001716560474974557, "rougeLsum_precision": 0.14960133985735732, "rougeLsum_precision_stderr": 0.0017445585328390875, "rougeLsum_recall": 0.2600296050390341, "rougeLsum_recall_stderr": 0.002552101400202146}}, "1": {"tldr_en": {"bleu": 2.463691563435929, "bleu_stderr": 0.056238669871763534, "rouge1_fmeasure": 0.20607218590152535, "rouge1_fmeasure_stderr": 0.0019348192676270253, "rouge1_precision": 0.17507251962841255, "rouge1_precision_stderr": 0.0019979193178610104, "rouge1_recall": 0.3008600636001003, "rouge1_recall_stderr": 0.002760929751246215, "rouge2_fmeasure": 0.04827166271383884, "rouge2_fmeasure_stderr": 0.000981638580890638, "rouge2_precision": 0.040814840621502924, "rouge2_precision_stderr": 0.0008769075552676463, "rouge2_recall": 0.07239534384858555, "rouge2_recall_stderr": 0.0015745819458817655, "rougeL_fmeasure": 0.14993882097099004, "rougeL_fmeasure_stderr": 0.0013143528095662098, "rougeL_precision": 0.12604513290190442, "rougeL_precision_stderr": 0.001331288983523961, "rougeL_recall": 0.22410236240350495, "rougeL_recall_stderr": 0.0021252698749924166, "rougeLsum_fmeasure": 0.19267274247494454, "rougeLsum_fmeasure_stderr": 0.0018042087124662393, "rougeLsum_precision": 0.1634506211607229, "rougeLsum_precision_stderr": 0.0018573519091366366, "rougeLsum_recall": 0.282240921650429, "rougeLsum_recall_stderr": 0.002619133620350951}}, "2": {"tldr_en": {"bleu": 3.013745331324814, "bleu_stderr": 0.06592229302797037, "rouge1_fmeasure": 0.22344837173188767, "rouge1_fmeasure_stderr": 0.0018933971382747686, "rouge1_precision": 0.19198287608229725, "rouge1_precision_stderr": 0.0021092410953077365, "rouge1_recall": 0.32589953688933776, "rouge1_recall_stderr": 0.002753157025629824, "rouge2_fmeasure": 0.05600464749019431, "rouge2_fmeasure_stderr": 0.001035304930307184, "rouge2_precision": 0.048137852198349124, "rouge2_precision_stderr": 0.0010124293758406351, "rouge2_recall": 0.08482162860321114, "rouge2_recall_stderr": 0.0017481381213950958, "rougeL_fmeasure": 0.15854970566001372, "rougeL_fmeasure_stderr": 0.0012774189115518982, "rougeL_precision": 0.13481580311964722, "rougeL_precision_stderr": 0.0014071194011638184, "rougeL_recall": 0.23733416551629913, "rougeL_recall_stderr": 0.00219502068666546, "rougeLsum_fmeasure": 0.21107970658856343, "rougeLsum_fmeasure_stderr": 0.0017866890110014018, "rougeLsum_precision": 0.18117711700554173, "rougeLsum_precision_stderr": 0.0019909544088953742, "rougeLsum_recall": 0.30882558318305736, "rougeLsum_recall_stderr": 0.0026454321668364913}}, "3": {"tldr_en": {"bleu": 2.986263870634216, "bleu_stderr": 0.0869488541072268, "rouge1_fmeasure": 0.1883894874399809, "rouge1_fmeasure_stderr": 0.0022518375274135087, "rouge1_precision": 
0.16688833624223554, "rouge1_precision_stderr": 0.0024001395431129838, "rouge1_recall": 0.27404204769144364, "rouge1_recall_stderr": 0.003375962083519381, "rouge2_fmeasure": 0.04708204631655152, "rouge2_fmeasure_stderr": 0.0010069752068567668, "rouge2_precision": 0.04136653299690803, "rouge2_precision_stderr": 0.0010174651332227344, "rouge2_recall": 0.07172155304298297, "rouge2_recall_stderr": 0.0017383368313820028, "rougeL_fmeasure": 0.13391288059874437, "rougeL_fmeasure_stderr": 0.001564486351392238, "rougeL_precision": 0.11810699062222797, "rougeL_precision_stderr": 0.001714836740267086, "rougeL_recall": 0.19959382723616914, "rougeL_recall_stderr": 0.002605476332476844, "rougeLsum_fmeasure": 0.17834159419543702, "rougeLsum_fmeasure_stderr": 0.002133544799751138, "rougeLsum_precision": 0.15789252138792423, "rougeLsum_precision_stderr": 0.0022764600108588808, "rougeLsum_recall": 0.26023963410461465, "rougeLsum_recall_stderr": 0.003240567155894993}}, "4": {"tldr_en": {"bleu": 0.7653537900871676, "bleu_stderr": 0.049323102026813614, "rouge1_fmeasure": 0.06229352605061771, "rouge1_fmeasure_stderr": 0.002059973094043126, "rouge1_precision": 0.05571162128505982, "rouge1_precision_stderr": 0.0019790843122430768, "rouge1_recall": 0.09490155511879607, "rouge1_recall_stderr": 0.003216109108055619, "rouge2_fmeasure": 0.01596233535299914, "rouge2_fmeasure_stderr": 0.0007332643966253175, "rouge2_precision": 0.013746904154847127, "rouge2_precision_stderr": 0.000658207908545434, "rouge2_recall": 0.02605352352304377, "rouge2_recall_stderr": 0.0013317962968417952, "rougeL_fmeasure": 0.04502172063067562, "rougeL_fmeasure_stderr": 0.0014806563009305842, "rougeL_precision": 0.04028867827357096, "rougeL_precision_stderr": 0.0014354449068175981, "rougeL_recall": 0.07013160574213247, "rougeL_recall_stderr": 0.0024380518058086703, "rougeLsum_fmeasure": 0.05884434373920439, "rougeLsum_fmeasure_stderr": 0.0019486127272992522, "rougeLsum_precision": 0.0526446584524354, "rougeLsum_precision_stderr": 0.0018735818695187819, "rougeLsum_recall": 0.08977749314111799, "rougeLsum_recall_stderr": 0.0030541817569144}}, "5": {"tldr_en": {"bleu": 2.313122283850011e-06, "bleu_stderr": 4.438365983452618e-06, "rouge1_fmeasure": 0.010262809104423124, "rouge1_fmeasure_stderr": 0.000941433577191994, "rouge1_precision": 0.009989872401580602, "rouge1_precision_stderr": 0.0009952828000827124, "rouge1_recall": 0.015276156161555617, "rouge1_recall_stderr": 0.001444671711873694, "rouge2_fmeasure": 0.0025848086028049965, "rouge2_fmeasure_stderr": 0.0003301678198930521, "rouge2_precision": 0.0024167502513949756, "rouge2_precision_stderr": 0.00033798462070527304, "rouge2_recall": 0.004213844763908219, "rouge2_recall_stderr": 0.0006183240014712593, "rougeL_fmeasure": 0.007530769725575652, "rougeL_fmeasure_stderr": 0.0006918159097560924, "rougeL_precision": 0.007350732687047713, "rougeL_precision_stderr": 0.000736612195363616, "rougeL_recall": 0.011512337859684543, "rougeL_recall_stderr": 0.0011311946838375108, "rougeLsum_fmeasure": 0.009601062566883777, "rougeLsum_fmeasure_stderr": 0.0008823539930029628, "rougeLsum_precision": 0.009401823549618816, "rougeLsum_precision_stderr": 0.0009477935776177291, "rougeLsum_recall": 0.014304994804442096, "rougeLsum_recall_stderr": 0.0013529102153531946}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.3016960813220789, "bleu_stderr": 0.04082243698870125, "rouge1_fmeasure": 0.08446489060239885, "rouge1_fmeasure_stderr": 0.000832880994211257, "rouge1_precision": 0.0628362517057776, 
"rouge1_precision_stderr": 0.0006647851313912593, "rouge1_recall": 0.137799699754724, "rouge1_recall_stderr": 0.0013399199497499206, "rouge2_fmeasure": 0.013098749477153882, "rouge2_fmeasure_stderr": 0.00036393407871382073, "rouge2_precision": 0.009897778438587298, "rouge2_precision_stderr": 0.0002813167869752497, "rouge2_recall": 0.02030644827526888, "rouge2_recall_stderr": 0.0005626168002295672, "rougeL_fmeasure": 0.08185345358329085, "rougeL_fmeasure_stderr": 0.0007758077715250189, "rougeL_precision": 0.060894085509621707, "rougeL_precision_stderr": 0.0006165662598640652, "rougeL_recall": 0.1334600824136164, "rougeL_recall_stderr": 0.0012622226683087598, "rougeLsum_fmeasure": 0.07821933484300712, "rougeLsum_fmeasure_stderr": 0.000742258462877918, "rougeLsum_precision": 0.05822489106803366, "rougeLsum_precision_stderr": 0.00059732648786741, "rougeLsum_recall": 0.12759747263850774, "rougeLsum_recall_stderr": 0.0011920315451119745}}, "1": {"generate_text_restaurant": {"bleu": 9.014911506469076, "bleu_stderr": 0.10639404635916568, "rouge1_fmeasure": 0.4071860181343587, "rouge1_fmeasure_stderr": 0.002172401493926826, "rouge1_precision": 0.4220547738164344, "rouge1_precision_stderr": 0.0029747898853502135, "rouge1_recall": 0.44890682882262095, "rouge1_recall_stderr": 0.0030482256607117985, "rouge2_fmeasure": 0.17675735102863532, "rouge2_fmeasure_stderr": 0.0016983524897307275, "rouge2_precision": 0.18398717679998686, "rouge2_precision_stderr": 0.002070314538395242, "rouge2_recall": 0.19695708046780447, "rouge2_recall_stderr": 0.0021253468809232486, "rougeL_fmeasure": 0.2901955244768342, "rougeL_fmeasure_stderr": 0.0017445755551563752, "rougeL_precision": 0.301979337271323, "rougeL_precision_stderr": 0.002426170411020153, "rougeL_recall": 0.32120889976820555, "rougeL_recall_stderr": 0.0024556761259021593, "rougeLsum_fmeasure": 0.3366538075182426, "rougeLsum_fmeasure_stderr": 0.0020840116286440125, "rougeLsum_precision": 0.3498631569304881, "rougeLsum_precision_stderr": 0.00275693533807641, "rougeLsum_recall": 0.3704033862486897, "rougeLsum_recall_stderr": 0.00277359280896307}}, "2": {"generate_text_restaurant": {"bleu": 11.1355912496663, "bleu_stderr": 0.13104161087327487, "rouge1_fmeasure": 0.4449026725503146, "rouge1_fmeasure_stderr": 0.0020303864410380614, "rouge1_precision": 0.45419518183344554, "rouge1_precision_stderr": 0.0027266345121146297, "rouge1_recall": 0.48044140772752547, "rouge1_recall_stderr": 0.002817950150203246, "rouge2_fmeasure": 0.20791813280930055, "rouge2_fmeasure_stderr": 0.0017732074653438918, "rouge2_precision": 0.2124884501715128, "rouge2_precision_stderr": 0.0020421137596173576, "rouge2_recall": 0.22663601203825093, "rouge2_recall_stderr": 0.0021799430214164035, "rougeL_fmeasure": 0.32200623778436555, "rougeL_fmeasure_stderr": 0.0017788348099423016, "rougeL_precision": 0.32873823752244813, "rougeL_precision_stderr": 0.002262430136729236, "rougeL_recall": 0.34880236417169236, "rougeL_recall_stderr": 0.0024064753259342344, "rougeLsum_fmeasure": 0.37259609212766276, "rougeLsum_fmeasure_stderr": 0.0020305961255707534, "rougeLsum_precision": 0.38047010773438, "rougeLsum_precision_stderr": 0.002559842275120574, "rougeLsum_recall": 0.40224234667337105, "rougeLsum_recall_stderr": 0.002660545444016772}}, "3": {"generate_text_restaurant": {"bleu": 12.10470203084112, "bleu_stderr": 0.160895558301053, "rouge1_fmeasure": 0.4492382386896765, "rouge1_fmeasure_stderr": 0.0019699301602619833, "rouge1_precision": 0.45456971607102353, "rouge1_precision_stderr": 0.0023960763531178423, 
"rouge1_recall": 0.47953391372338766, "rouge1_recall_stderr": 0.0027865520920584822, "rouge2_fmeasure": 0.2138339925008178, "rouge2_fmeasure_stderr": 0.0018445417129634963, "rouge2_precision": 0.2163784140584202, "rouge2_precision_stderr": 0.001980036007401307, "rouge2_recall": 0.23028758666497792, "rouge2_recall_stderr": 0.0022756293438486903, "rougeL_fmeasure": 0.3303919839118561, "rougeL_fmeasure_stderr": 0.001783388701966232, "rougeL_precision": 0.33426681511660167, "rougeL_precision_stderr": 0.002069523940604892, "rougeL_recall": 0.35354783483337526, "rougeL_recall_stderr": 0.0024228991972543336, "rougeLsum_fmeasure": 0.378239342272828, "rougeLsum_fmeasure_stderr": 0.002043462221499771, "rougeLsum_precision": 0.3827360863233357, "rougeLsum_precision_stderr": 0.002359678766192712, "rougeLsum_recall": 0.40399025031746394, "rougeLsum_recall_stderr": 0.002710450542079631}}, "4": {"generate_text_restaurant": {"bleu": 12.44085622955195, "bleu_stderr": 0.13067375364343567, "rouge1_fmeasure": 0.4511875093908521, "rouge1_fmeasure_stderr": 0.001948885960105966, "rouge1_precision": 0.45479182860495637, "rouge1_precision_stderr": 0.0023107278825576553, "rouge1_recall": 0.48017725960351876, "rouge1_recall_stderr": 0.0027316537664281945, "rouge2_fmeasure": 0.21707781052843164, "rouge2_fmeasure_stderr": 0.00185164014349205, "rouge2_precision": 0.21833473257724795, "rouge2_precision_stderr": 0.0019502917664114363, "rouge2_recall": 0.23333710855434647, "rouge2_recall_stderr": 0.002269432186215575, "rougeL_fmeasure": 0.33434874026181494, "rougeL_fmeasure_stderr": 0.0017786642192651214, "rougeL_precision": 0.3366370287941819, "rougeL_precision_stderr": 0.0019965287497770777, "rougeL_recall": 0.35684028344910274, "rougeL_recall_stderr": 0.0024057773390550254, "rougeLsum_fmeasure": 0.38297835064075053, "rougeLsum_fmeasure_stderr": 0.0020438066934495387, "rougeLsum_precision": 0.38571523251054607, "rougeLsum_precision_stderr": 0.0022865314323742805, "rougeLsum_recall": 0.40803304829345277, "rougeLsum_recall_stderr": 0.0027038368009947755}}, "5": {"generate_text_restaurant": {"bleu": 12.352678257477072, "bleu_stderr": 0.1229858870702429, "rouge1_fmeasure": 0.4520495564262539, "rouge1_fmeasure_stderr": 0.0019091091728356577, "rouge1_precision": 0.4536679355694742, "rouge1_precision_stderr": 0.002282805690194143, "rouge1_recall": 0.48257839264210245, "rouge1_recall_stderr": 0.002694452362144022, "rouge2_fmeasure": 0.21807002700335165, "rouge2_fmeasure_stderr": 0.0018243298667958761, "rouge2_precision": 0.2184937204533077, "rouge2_precision_stderr": 0.0019084340007320115, "rouge2_recall": 0.2348008779716612, "rouge2_recall_stderr": 0.0022303214323899125, "rougeL_fmeasure": 0.336313193439231, "rougeL_fmeasure_stderr": 0.001791714678974818, "rougeL_precision": 0.33692930278278704, "rougeL_precision_stderr": 0.0019923203298548866, "rougeL_recall": 0.36002100624003847, "rougeL_recall_stderr": 0.0024053149539646364, "rougeLsum_fmeasure": 0.38490335194628533, "rougeLsum_fmeasure_stderr": 0.002018489316229454, "rougeLsum_precision": 0.38599958036488224, "rougeLsum_precision_stderr": 0.002272531904720447, "rougeLsum_recall": 0.4113378384062774, "rougeLsum_recall_stderr": 0.002667546056299814}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.194933977793404, "bleu_stderr": 0.09650169688197711, "rouge1_fmeasure": 0.2157374708631877, "rouge1_fmeasure_stderr": 0.002508783327476019, "rouge1_precision": 0.15954668519186138, "rouge1_precision_stderr": 0.002100617390256336, "rouge1_recall": 0.36309739763041776, 
"rouge1_recall_stderr": 0.004368615445871372, "rouge2_fmeasure": 0.05279448511938101, "rouge2_fmeasure_stderr": 0.001664044690700437, "rouge2_precision": 0.03857312620535916, "rouge2_precision_stderr": 0.0012905906114606838, "rouge2_recall": 0.09177566784631346, "rouge2_recall_stderr": 0.0029866334470645636, "rougeL_fmeasure": 0.1604319788556621, "rougeL_fmeasure_stderr": 0.0019556284658093183, "rougeL_precision": 0.11839547499241576, "rougeL_precision_stderr": 0.001646949604960197, "rougeL_recall": 0.27198898577078495, "rougeL_recall_stderr": 0.0035222426961979128, "rougeLsum_fmeasure": 0.1697976651408494, "rougeLsum_fmeasure_stderr": 0.002135233558113895, "rougeLsum_precision": 0.1250746587410693, "rougeLsum_precision_stderr": 0.0017464873888176696, "rougeLsum_recall": 0.28828909491111, "rougeLsum_recall_stderr": 0.0038785736556512756}}, "1": {"article_DOC_summary": {"bleu": 1.638275312642142, "bleu_stderr": 0.11197811159644402, "rouge1_fmeasure": 0.18261422772647087, "rouge1_fmeasure_stderr": 0.00258517997013667, "rouge1_precision": 0.130085376048313, "rouge1_precision_stderr": 0.0019273226219467348, "rouge1_recall": 0.31899443410148204, "rouge1_recall_stderr": 0.004375864955378675, "rouge2_fmeasure": 0.03829439148207006, "rouge2_fmeasure_stderr": 0.0015268296315399182, "rouge2_precision": 0.027035766317831913, "rouge2_precision_stderr": 0.0010834329723279196, "rouge2_recall": 0.06843246331973844, "rouge2_recall_stderr": 0.002783755254166952, "rougeL_fmeasure": 0.13872448607191418, "rougeL_fmeasure_stderr": 0.0019214547149178605, "rougeL_precision": 0.09860861520075329, "rougeL_precision_stderr": 0.0014180838773530536, "rougeL_recall": 0.2439616017337858, "rougeL_recall_stderr": 0.003388978319546548, "rougeLsum_fmeasure": 0.14673205236069722, "rougeLsum_fmeasure_stderr": 0.002156691093638724, "rougeLsum_precision": 0.10431817474650688, "rougeLsum_precision_stderr": 0.0015880489605249845, "rougeLsum_recall": 0.25778306821268704, "rougeLsum_recall_stderr": 0.0037812899736781526}}, "2": {"article_DOC_summary": {"bleu": 2.0210914476360786, "bleu_stderr": 0.12454216962651855, "rouge1_fmeasure": 0.1978001315070318, "rouge1_fmeasure_stderr": 0.002669936318830053, "rouge1_precision": 0.14059986141007405, "rouge1_precision_stderr": 0.001980010991959053, "rouge1_recall": 0.34704590818514364, "rouge1_recall_stderr": 0.004609783640663682, "rouge2_fmeasure": 0.04760922765448965, "rouge2_fmeasure_stderr": 0.001702755511678876, "rouge2_precision": 0.033448694107769246, "rouge2_precision_stderr": 0.0011984145262489562, "rouge2_recall": 0.08605872654054372, "rouge2_recall_stderr": 0.003191665944600775, "rougeL_fmeasure": 0.1511855886298728, "rougeL_fmeasure_stderr": 0.0020261890172370246, "rougeL_precision": 0.10721276455620249, "rougeL_precision_stderr": 0.0014796729903083395, "rougeL_recall": 0.26727650609085896, "rougeL_recall_stderr": 0.0036995069904560843, "rougeLsum_fmeasure": 0.16004925294103853, "rougeLsum_fmeasure_stderr": 0.0022364087298009884, "rougeLsum_precision": 0.11346372550782761, "rougeLsum_precision_stderr": 0.0016303825401644061, "rougeLsum_recall": 0.28288787023938955, "rougeLsum_recall_stderr": 0.004038388892533831}}, "3": {"article_DOC_summary": {"bleu": 2.03791695157546, "bleu_stderr": 0.1342348737065576, "rouge1_fmeasure": 0.19482488018157684, "rouge1_fmeasure_stderr": 0.0028449440307477085, "rouge1_precision": 0.14077378985561326, "rouge1_precision_stderr": 0.002167627071465062, "rouge1_recall": 0.3365881713882994, "rouge1_recall_stderr": 0.004904093832075129, 
"rouge2_fmeasure": 0.04702347182983231, "rouge2_fmeasure_stderr": 0.001623530160402657, "rouge2_precision": 0.03346907677060051, "rouge2_precision_stderr": 0.00115993469618386, "rouge2_recall": 0.08385926778368435, "rouge2_recall_stderr": 0.0030304591639245464, "rougeL_fmeasure": 0.14665273300314152, "rougeL_fmeasure_stderr": 0.00212580713201847, "rougeL_precision": 0.10579758272401656, "rougeL_precision_stderr": 0.0016080916023179224, "rougeL_recall": 0.25509688514423123, "rougeL_recall_stderr": 0.003826588022233898, "rougeLsum_fmeasure": 0.15621401458929957, "rougeLsum_fmeasure_stderr": 0.00234420229722605, "rougeLsum_precision": 0.11263878026618175, "rougeLsum_precision_stderr": 0.00176128619109869, "rougeLsum_recall": 0.2715816804884895, "rougeLsum_recall_stderr": 0.004199504898315826}}, "4": {"article_DOC_summary": {"bleu": 0.9907623400783704, "bleu_stderr": 0.12144441541342028, "rouge1_fmeasure": 0.05341124486563334, "rouge1_fmeasure_stderr": 0.0029118807193791816, "rouge1_precision": 0.04332722979741195, "rouge1_precision_stderr": 0.0025397484135024503, "rouge1_recall": 0.08638484320409213, "rouge1_recall_stderr": 0.004855712915740289, "rouge2_fmeasure": 0.0126600494371876, "rouge2_fmeasure_stderr": 0.0010393927624132967, "rouge2_precision": 0.009371982127377696, "rouge2_precision_stderr": 0.0007761604812612375, "rouge2_recall": 0.02197790550576286, "rouge2_recall_stderr": 0.0018737167719494205, "rougeL_fmeasure": 0.04031062592059575, "rougeL_fmeasure_stderr": 0.0022065055222514843, "rougeL_precision": 0.03308111037957417, "rougeL_precision_stderr": 0.002037483672167313, "rougeL_recall": 0.06547067873377617, "rougeL_recall_stderr": 0.0037402890017668913, "rougeLsum_fmeasure": 0.04344922023460101, "rougeLsum_fmeasure_stderr": 0.002384615926685988, "rougeLsum_precision": 0.03547102946933422, "rougeLsum_precision_stderr": 0.002145817702406172, "rougeLsum_recall": 0.07066149952031933, "rougeLsum_recall_stderr": 0.004048133700252855}}, "5": {"article_DOC_summary": {"bleu": 1.959625061740418e-19, "bleu_stderr": 1.1984703371139495e-15, "rouge1_fmeasure": 0.0026267053439145405, "rouge1_fmeasure_stderr": 0.0006851925438686376, "rouge1_precision": 0.002432491256372859, "rouge1_precision_stderr": 0.0006531802890059347, "rouge1_recall": 0.0035688165030904754, "rouge1_recall_stderr": 0.000969634171191206, "rouge2_fmeasure": 0.0003000575858304317, "rouge2_fmeasure_stderr": 0.00012455541465929192, "rouge2_precision": 0.0002462802129838755, "rouge2_precision_stderr": 0.00010248150275091556, "rouge2_recall": 0.00045873937383371344, "rouge2_recall_stderr": 0.0002027676865696476, "rougeL_fmeasure": 0.001963800127619678, "rougeL_fmeasure_stderr": 0.0005077215919641312, "rougeL_precision": 0.001824459823195743, "rougeL_precision_stderr": 0.000493813415204384, "rougeL_recall": 0.002706254470313614, "rougeL_recall_stderr": 0.0007319861753860738, "rougeLsum_fmeasure": 0.002094899399917077, "rougeLsum_fmeasure_stderr": 0.0005500941719652507, "rougeLsum_precision": 0.001947339725356528, "rougeLsum_precision_stderr": 0.0005292516418451348, "rougeLsum_recall": 0.0028418243363945105, "rougeLsum_recall_stderr": 0.0007704086076680828}}}}
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.19198287608229725,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.0021092410953077365
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.32589953688933776,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.002753157025629824
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.22344837173188767,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.0018933971382747686
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.048137852198349124,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0010124293758406351
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.08482162860321114,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0017481381213950958
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.05600464749019431,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.001035304930307184
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.13481580311964722,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.0014071194011638184
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.23733416551629913,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.00219502068666546
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.15854970566001372,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0012774189115518982
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.18117711700554173,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.0019909544088953742
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.30882558318305736,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.0026454321668364913
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.21107970658856343,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0017866890110014018
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 3.013745331324814,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.06592229302797037
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 2,
126
+ "batch_size": 8,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
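Each slim.*.json added here stores one metric per entry in its results list, with the matching *_stderr in the same record and the run settings under config. A small sketch of flattening that layout into a single dict (the file path relative to the repo root is an assumption):

# Sketch only: collapse the per-metric records of a slim.*.json file.
import json

def load_slim(path):
    with open(path) as f:
        slim = json.load(f)
    flat = {}
    for record in slim["results"]:
        # Each record carries one metric plus its *_stderr companion,
        # alongside task_name / prompt_name / dataset_* boilerplate.
        for key, value in record.items():
            if key in ("task_name", "prompt_name", "dataset_path", "dataset_name", "subset"):
                continue
            flat[key] = value
    return flat, slim["config"]

metrics, config = load_slim(
    "8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json"
)
print(config["num_fewshot"], metrics["rouge2_fmeasure"])  # 2 0.05600464749019431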
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.16688833624223554,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.0024001395431129838
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.27404204769144364,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.003375962083519381
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.1883894874399809,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.0022518375274135087
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.04136653299690803,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0010174651332227344
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.07172155304298297,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0017383368313820028
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.04708204631655152,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0010069752068567668
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.11810699062222797,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.001714836740267086
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.19959382723616914,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.002605476332476844
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.13391288059874437,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.001564486351392238
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.15789252138792423,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.0022764600108588808
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.26023963410461465,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.003240567155894993
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.17834159419543702,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.002133544799751138
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 2.986263870634216,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.0869488541072268
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 3,
126
+ "batch_size": 8,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
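The config blocks in these files repeat the same model_args value; it is a plain comma-separated key=value string, so it can be split apart when comparing runs. A sketch of that split (generic parsing, not necessarily what the eval harness itself does internally):

# Sketch only: split the model_args string from the config block into a dict.
model_args = (
    "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/"
    "lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,"
    "use_accelerate=True,"
    "tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,"
    "dtype=bfloat16"
)
parsed = dict(pair.split("=", 1) for pair in model_args.split(","))
print(parsed["dtype"])  # bfloat16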
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.05571162128505982,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.0019790843122430768
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.09490155511879607,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.003216109108055619
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.06229352605061771,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.002059973094043126
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.013746904154847127,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.000658207908545434
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.02605352352304377,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0013317962968417952
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.01596233535299914,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0007332643966253175
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.04028867827357096,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.0014354449068175981
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.07013160574213247,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0024380518058086703
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.04502172063067562,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0014806563009305842
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.0526446584524354,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.0018735818695187819
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.08977749314111799,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.0030541817569144
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.05884434373920439,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0019486127272992522
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 0.7653537900871676,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.049323102026813614
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 4,
126
+ "batch_size": 8,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.009989872401580602,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.0009952828000827124
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.015276156161555617,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.001444671711873694
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.010262809104423124,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.000941433577191994
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.0024167502513949756,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.00033798462070527304
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.004213844763908219,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0006183240014712593
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.0025848086028049965,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0003301678198930521
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.007350732687047713,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.000736612195363616
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.011512337859684543,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0011311946838375108
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.007530769725575652,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0006918159097560924
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.009401823549618816,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.0009477935776177291
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.014304994804442096,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.0013529102153531946
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.009601062566883777,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0008823539930029628
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 2.313122283850011e-06,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 4.438365983452618e-06
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 5,
126
+ "batch_size": 8,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.14077378985561326,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.002167627071465062
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.3365881713882994,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.004904093832075129
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.19482488018157684,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.0028449440307477085
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.03346907677060051,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.00115993469618386
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.08385926778368435,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.0030304591639245464
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.04702347182983231,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.001623530160402657
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.10579758272401656,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.0016080916023179224
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.25509688514423123,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.003826588022233898
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.14665273300314152,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.00212580713201847
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.11263878026618175,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.00176128619109869
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.2715816804884895,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.004199504898315826
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.15621401458929957,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.00234420229722605
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 2.03791695157546,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 0.1342348737065576
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 3,
126
+ "batch_size": 8,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.04332722979741195,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0025397484135024503
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.08638484320409213,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.004855712915740289
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.05341124486563334,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.0029118807193791816
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.009371982127377696,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.0007761604812612375
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.02197790550576286,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.0018737167719494205
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.0126600494371876,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.0010393927624132967
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.03308111037957417,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.002037483672167313
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.06547067873377617,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.0037402890017668913
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.04031062592059575,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.0022065055222514843
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.03547102946933422,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.002145817702406172
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.07066149952031933,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.004048133700252855
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.04344922023460101,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.002384615926685988
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 0.9907623400783704,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 0.12144441541342028
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 4,
126
+ "batch_size": 8,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.002432491256372859,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0006531802890059347
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.0035688165030904754,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.000969634171191206
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.0026267053439145405,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.0006851925438686376
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.0002462802129838755,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.00010248150275091556
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.00045873937383371344,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.0002027676865696476
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.0003000575858304317,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.00012455541465929192
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.001824459823195743,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.000493813415204384
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.002706254470313614,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.0007319861753860738
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.001963800127619678,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.0005077215919641312
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.001947339725356528,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.0005292516418451348
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.0028418243363945105,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.0007704086076680828
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.002094899399917077,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.0005500941719652507
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 1.959625061740418e-19,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 1.1984703371139495e-15
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 5,
126
+ "batch_size": 8,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
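With the wiki_lingua and xsum slim files above in place, they can be rolled up into one table. The sketch below only approximates the bundled merged.csv, whose exact column layout is not shown in this part of the diff; the output filename slim_rollup.csv is illustrative.

# Sketch only: roll every slim.*.json in the generation folder up into one CSV.
import csv, glob, json

rows = []
for path in sorted(glob.glob("8b7178b13b/evaluation/generation/slim.*.json")):
    with open(path) as f:
        slim = json.load(f)
    fewshot = slim["config"]["num_fewshot"]
    for record in slim["results"]:
        for key, value in record.items():
            if key.endswith("_stderr") or key in (
                "task_name", "prompt_name", "dataset_path", "dataset_name", "subset"
            ):
                continue
            rows.append({
                "task": record["task_name"],
                "prompt": record["prompt_name"],
                "fewshot": fewshot,
                "metric": key,
                "value": value,
                "stderr": record.get(f"{key}_stderr"),
            })

with open("slim_rollup.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=rows[0].keys())
    writer.writeheader()
    writer.writerows(rows)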
8b7178b13b/evaluation/rankeval/8b7178b13b_0.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.35,0.01509065034144423,0
3
+ anli_r2,acc,0.346,0.015050266127564448,0
4
+ anli_r3,acc,0.355,0.013819249004047296,0
5
+ arc_challenge,acc,0.29180887372013653,0.013284525292403496,0
6
+ arc_challenge,acc_norm,0.3216723549488055,0.013650488084494162,0
7
+ arc_easy,acc,0.6220538720538721,0.009949405744045469,0
8
+ arc_easy,acc_norm,0.5395622895622896,0.010227616386289017,0
9
+ boolq,acc,0.6376146788990825,0.008407308655864048,1
10
+ cb,acc,0.26785714285714285,0.05971290310957636,1
11
+ cb,f1,0.2374338624338624,,1
12
+ copa,acc,0.81,0.03942772444036623,0
13
+ hellaswag,acc,0.5215096594303924,0.004985162074336112,0
14
+ hellaswag,acc_norm,0.6843258315076678,0.004638339207348913,0
15
+ piqa,acc,0.7627856365614799,0.009924694933586364,0
16
+ piqa,acc_norm,0.7747551686615887,0.00974664347103214,0
17
+ rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.87,0.010640169792499361,0
19
+ sciq,acc_norm,0.807,0.012486268734370145,0
20
+ storycloze_2016,acc,0.7455905932656334,0.010071542492663043,0
21
+ winogrande,acc,0.5659037095501184,0.013929882555694054,0
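The rankeval CSVs carry value and err columns; err matches the *_stderr fields in the backup JSON below, so under a normal approximation an approximate 95% interval is value ± 1.96 × err. A short sketch (the relative path is an assumption):

# Sketch only: read a rankeval CSV and print approximate 95% intervals.
import csv

with open("8b7178b13b/evaluation/rankeval/8b7178b13b_0.csv") as f:
    for row in csv.DictReader(f):
        if not row["err"]:          # cb f1 has no stderr in these files
            continue
        value, err = float(row["value"]), float(row["err"])
        lo, hi = value - 1.96 * err, value + 1.96 * err
        print(f'{row["task"]:16s} {row["metric"]:9s} {value:.3f} [{lo:.3f}, {hi:.3f}]')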
8b7178b13b/evaluation/rankeval/8b7178b13b_0_lm-eval_global_step84877_2023-05-15-10-07-19_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.35,
- "acc_stderr": 0.01509065034144423
- },
- "anli_r2": {
- "acc": 0.346,
- "acc_stderr": 0.015050266127564448
- },
- "anli_r3": {
- "acc": 0.355,
- "acc_stderr": 0.013819249004047296
- },
- "cb": {
- "acc": 0.26785714285714285,
- "acc_stderr": 0.05971290310957636,
- "f1": 0.2374338624338624
- },
- "copa": {
- "acc": 0.81,
- "acc_stderr": 0.03942772444036623
- },
- "hellaswag": {
- "acc": 0.5215096594303924,
- "acc_stderr": 0.004985162074336112,
- "acc_norm": 0.6843258315076678,
- "acc_norm_stderr": 0.004638339207348913
- },
- "rte": {
- "acc": 0.5270758122743683,
- "acc_stderr": 0.030052303463143706
- },
- "winogrande": {
- "acc": 0.5659037095501184,
- "acc_stderr": 0.013929882555694054
- },
- "storycloze_2016": {
- "acc": 0.7455905932656334,
- "acc_stderr": 0.010071542492663043
- },
- "boolq": {
- "acc": 0.6376146788990825,
- "acc_stderr": 0.008407308655864048
- },
- "arc_easy": {
- "acc": 0.6220538720538721,
- "acc_stderr": 0.009949405744045469,
- "acc_norm": 0.5395622895622896,
- "acc_norm_stderr": 0.010227616386289017
- },
- "arc_challenge": {
- "acc": 0.29180887372013653,
- "acc_stderr": 0.013284525292403496,
- "acc_norm": 0.3216723549488055,
- "acc_norm_stderr": 0.013650488084494162
- },
- "sciq": {
- "acc": 0.87,
- "acc_stderr": 0.010640169792499361,
- "acc_norm": 0.807,
- "acc_norm_stderr": 0.012486268734370145
- },
- "piqa": {
- "acc": 0.7627856365614799,
- "acc_stderr": 0.009924694933586364,
- "acc_norm": 0.7747551686615887,
- "acc_norm_stderr": 0.00974664347103214
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b13b/evaluation/rankeval/8b7178b13b_1.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.328,0.014853842487270334,0
+ anli_r2,acc,0.316,0.01470919305605713,0
+ anli_r3,acc,0.3591666666666667,0.013855141559780364,0
+ arc_challenge,acc,0.3054607508532423,0.0134600804780025,0
+ arc_challenge,acc_norm,0.3319112627986348,0.013760988200880538,0
+ arc_easy,acc,0.6422558922558923,0.009835772757343361,0
+ arc_easy,acc_norm,0.6035353535353535,0.010037412763064529,0
+ boolq,acc,0.6477064220183486,0.00835476049390613,1
+ cb,acc,0.26785714285714285,0.05971290310957635,1
+ cb,f1,0.21294539321104786,,1
+ copa,acc,0.79,0.040936018074033256,0
+ hellaswag,acc,0.5201155148376817,0.004985741706385719,0
+ hellaswag,acc_norm,0.6825333598884684,0.004645393477680675,0
+ piqa,acc,0.764961915125136,0.009893146688805326,0
+ piqa,acc_norm,0.7725788900979326,0.009779850767847232,0
+ rte,acc,0.4657039711191336,0.030025579819366426,0
+ sciq,acc,0.905,0.009276910103103329,0
+ sciq,acc_norm,0.88,0.0102813280127474,0
+ storycloze_2016,acc,0.7365045430251203,0.010187168219156485,0
+ winogrande,acc,0.5872138910812944,0.013837060648682103,0
8b7178b13b/evaluation/rankeval/8b7178b13b_1_lm-eval_global_step84877_2023-05-15-10-07-19_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.328,
- "acc_stderr": 0.014853842487270334
- },
- "anli_r2": {
- "acc": 0.316,
- "acc_stderr": 0.01470919305605713
- },
- "anli_r3": {
- "acc": 0.3591666666666667,
- "acc_stderr": 0.013855141559780364
- },
- "cb": {
- "acc": 0.26785714285714285,
- "acc_stderr": 0.05971290310957635,
- "f1": 0.21294539321104786
- },
- "copa": {
- "acc": 0.79,
- "acc_stderr": 0.040936018074033256
- },
- "hellaswag": {
- "acc": 0.5201155148376817,
- "acc_stderr": 0.004985741706385719,
- "acc_norm": 0.6825333598884684,
- "acc_norm_stderr": 0.004645393477680675
- },
- "rte": {
- "acc": 0.4657039711191336,
- "acc_stderr": 0.030025579819366426
- },
- "winogrande": {
- "acc": 0.5872138910812944,
- "acc_stderr": 0.013837060648682103
- },
- "storycloze_2016": {
- "acc": 0.7365045430251203,
- "acc_stderr": 0.010187168219156485
- },
- "boolq": {
- "acc": 0.6477064220183486,
- "acc_stderr": 0.00835476049390613
- },
- "arc_easy": {
- "acc": 0.6422558922558923,
- "acc_stderr": 0.009835772757343361,
- "acc_norm": 0.6035353535353535,
- "acc_norm_stderr": 0.010037412763064529
- },
- "arc_challenge": {
- "acc": 0.3054607508532423,
- "acc_stderr": 0.0134600804780025,
- "acc_norm": 0.3319112627986348,
- "acc_norm_stderr": 0.013760988200880538
- },
- "sciq": {
- "acc": 0.905,
- "acc_stderr": 0.009276910103103329,
- "acc_norm": 0.88,
- "acc_norm_stderr": 0.0102813280127474
- },
- "piqa": {
- "acc": 0.764961915125136,
- "acc_stderr": 0.009893146688805326,
- "acc_norm": 0.7725788900979326,
- "acc_norm_stderr": 0.009779850767847232
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b13b/evaluation/rankeval/8b7178b13b_2.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.336,0.014944140233795018,0
+ anli_r2,acc,0.329,0.01486539538592835,0
+ anli_r3,acc,0.35083333333333333,0.013782212417178195,0
+ arc_challenge,acc,0.310580204778157,0.013522292098053059,0
+ arc_challenge,acc_norm,0.33532423208191126,0.013796182947785562,0
+ arc_easy,acc,0.6481481481481481,0.009799078929868706,0
+ arc_easy,acc_norm,0.6212121212121212,0.00995373765654204,0
+ boolq,acc,0.6382262996941896,0.008404238796949254,1
+ cb,acc,0.19642857142857142,0.05357142857142858,1
+ cb,f1,0.1668300653594771,,1
+ copa,acc,0.78,0.04163331998932263,0
+ hellaswag,acc,0.5200159330810595,0.004985781620467012,0
+ hellaswag,acc_norm,0.6863174666401115,0.004630407476835209,0
+ piqa,acc,0.7573449401523396,0.0100020025697087,0
+ piqa,acc_norm,0.764961915125136,0.009893146688805319,0
+ rte,acc,0.48014440433212996,0.0300727231673172,0
+ sciq,acc,0.913,0.00891686663074591,0
+ sciq,acc_norm,0.889,0.009938701010583726,0
+ storycloze_2016,acc,0.743452699091395,0.01009926092771917,0
+ winogrande,acc,0.5864246250986582,0.013840971763195306,0
8b7178b13b/evaluation/rankeval/8b7178b13b_2_lm-eval_global_step84877_2023-05-15-10-07-19_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.336,
- "acc_stderr": 0.014944140233795018
- },
- "anli_r2": {
- "acc": 0.329,
- "acc_stderr": 0.01486539538592835
- },
- "anli_r3": {
- "acc": 0.35083333333333333,
- "acc_stderr": 0.013782212417178195
- },
- "cb": {
- "acc": 0.19642857142857142,
- "acc_stderr": 0.05357142857142858,
- "f1": 0.1668300653594771
- },
- "copa": {
- "acc": 0.78,
- "acc_stderr": 0.04163331998932263
- },
- "hellaswag": {
- "acc": 0.5200159330810595,
- "acc_stderr": 0.004985781620467012,
- "acc_norm": 0.6863174666401115,
- "acc_norm_stderr": 0.004630407476835209
- },
- "rte": {
- "acc": 0.48014440433212996,
- "acc_stderr": 0.0300727231673172
- },
- "winogrande": {
- "acc": 0.5864246250986582,
- "acc_stderr": 0.013840971763195306
- },
- "storycloze_2016": {
- "acc": 0.743452699091395,
- "acc_stderr": 0.01009926092771917
- },
- "boolq": {
- "acc": 0.6382262996941896,
- "acc_stderr": 0.008404238796949254
- },
- "arc_easy": {
- "acc": 0.6481481481481481,
- "acc_stderr": 0.009799078929868706,
- "acc_norm": 0.6212121212121212,
- "acc_norm_stderr": 0.00995373765654204
- },
- "arc_challenge": {
- "acc": 0.310580204778157,
- "acc_stderr": 0.013522292098053059,
- "acc_norm": 0.33532423208191126,
- "acc_norm_stderr": 0.013796182947785562
- },
- "sciq": {
- "acc": 0.913,
- "acc_stderr": 0.00891686663074591,
- "acc_norm": 0.889,
- "acc_norm_stderr": 0.009938701010583726
- },
- "piqa": {
- "acc": 0.7573449401523396,
- "acc_stderr": 0.0100020025697087,
- "acc_norm": 0.764961915125136,
- "acc_norm_stderr": 0.009893146688805319
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b13b/evaluation/rankeval/8b7178b13b_3.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.348,0.01507060460376841,0
+ anli_r2,acc,0.344,0.015029633724408947,0
+ anli_r3,acc,0.3325,0.013605417345710528,0
+ arc_challenge,acc,0.3148464163822526,0.01357265770308495,0
+ arc_challenge,acc_norm,0.3250853242320819,0.013688147309729119,0
+ arc_easy,acc,0.648989898989899,0.009793703885101042,0
+ arc_easy,acc_norm,0.6393097643097643,0.009853512108416748,0
+ boolq,acc,0.6363914373088685,0.008413404209789989,1
+ cb,acc,0.3392857142857143,0.06384226561930825,1
+ cb,f1,0.3185837135128588,,1
+ copa,acc,0.78,0.04163331998932263,0
+ hellaswag,acc,0.5206134236207927,0.00498553915978342,0
+ hellaswag,acc_norm,0.6902011551483768,0.00461465517501001,0
+ piqa,acc,0.7584330794341676,0.009986718001804467,0
+ piqa,acc_norm,0.7665941240478781,0.009869247889520991,0
+ rte,acc,0.48014440433212996,0.0300727231673172,0
+ sciq,acc,0.917,0.008728527206074792,0
+ sciq,acc_norm,0.911,0.009008893392651521,0
+ storycloze_2016,acc,0.7504008551576697,0.010008002459430848,0
+ winogrande,acc,0.601420678768745,0.01376035717687383,0
8b7178b13b/evaluation/rankeval/8b7178b13b_3_lm-eval_global_step84877_2023-05-15-10-07-25_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.348,
- "acc_stderr": 0.01507060460376841
- },
- "anli_r2": {
- "acc": 0.344,
- "acc_stderr": 0.015029633724408947
- },
- "anli_r3": {
- "acc": 0.3325,
- "acc_stderr": 0.013605417345710528
- },
- "cb": {
- "acc": 0.3392857142857143,
- "acc_stderr": 0.06384226561930825,
- "f1": 0.3185837135128588
- },
- "copa": {
- "acc": 0.78,
- "acc_stderr": 0.04163331998932263
- },
- "hellaswag": {
- "acc": 0.5206134236207927,
- "acc_stderr": 0.00498553915978342,
- "acc_norm": 0.6902011551483768,
- "acc_norm_stderr": 0.00461465517501001
- },
- "rte": {
- "acc": 0.48014440433212996,
- "acc_stderr": 0.0300727231673172
- },
- "winogrande": {
- "acc": 0.601420678768745,
- "acc_stderr": 0.01376035717687383
- },
- "storycloze_2016": {
- "acc": 0.7504008551576697,
- "acc_stderr": 0.010008002459430848
- },
- "boolq": {
- "acc": 0.6363914373088685,
- "acc_stderr": 0.008413404209789989
- },
- "arc_easy": {
- "acc": 0.648989898989899,
- "acc_stderr": 0.009793703885101042,
- "acc_norm": 0.6393097643097643,
- "acc_norm_stderr": 0.009853512108416748
- },
- "arc_challenge": {
- "acc": 0.3148464163822526,
- "acc_stderr": 0.01357265770308495,
- "acc_norm": 0.3250853242320819,
- "acc_norm_stderr": 0.013688147309729119
- },
- "sciq": {
- "acc": 0.917,
- "acc_stderr": 0.008728527206074792,
- "acc_norm": 0.911,
- "acc_norm_stderr": 0.009008893392651521
- },
- "piqa": {
- "acc": 0.7584330794341676,
- "acc_stderr": 0.009986718001804467,
- "acc_norm": 0.7665941240478781,
- "acc_norm_stderr": 0.009869247889520991
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b13b/evaluation/rankeval/8b7178b13b_4.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.344,0.015029633724408947,0
+ anli_r2,acc,0.345,0.015039986742055238,0
+ anli_r3,acc,0.34833333333333333,0.013759437498874072,0
+ arc_challenge,acc,0.3199658703071672,0.013631345807016196,0
+ arc_challenge,acc_norm,0.3447098976109215,0.01388881628678211,0
+ arc_easy,acc,0.6632996632996633,0.009697166595752475,0
+ arc_easy,acc_norm,0.6447811447811448,0.009820245899287124,0
+ boolq,acc,0.636085626911315,0.00841491890912884,1
+ cb,acc,0.21428571428571427,0.055328333517248834,1
+ cb,f1,0.1997113997113997,,1
+ copa,acc,0.79,0.040936018074033256,0
+ hellaswag,acc,0.5206134236207927,0.004985539159783419,0
+ hellaswag,acc_norm,0.6900019916351324,0.004615472210316043,0
+ piqa,acc,0.7665941240478781,0.009869247889521007,0
+ piqa,acc_norm,0.7682263329706203,0.00984514377279404,0
+ rte,acc,0.48736462093862815,0.030086851767188564,0
+ sciq,acc,0.927,0.008230354715244062,0
+ sciq,acc_norm,0.908,0.009144376393151108,0
+ storycloze_2016,acc,0.7509353287012293,0.010000841162740146,0
+ winogrande,acc,0.6077348066298343,0.013722400462000888,0
8b7178b13b/evaluation/rankeval/8b7178b13b_4_lm-eval_global_step84877_2023-05-15-10-06-46_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.344,
- "acc_stderr": 0.015029633724408947
- },
- "anli_r2": {
- "acc": 0.345,
- "acc_stderr": 0.015039986742055238
- },
- "anli_r3": {
- "acc": 0.34833333333333333,
- "acc_stderr": 0.013759437498874072
- },
- "cb": {
- "acc": 0.21428571428571427,
- "acc_stderr": 0.055328333517248834,
- "f1": 0.1997113997113997
- },
- "copa": {
- "acc": 0.79,
- "acc_stderr": 0.040936018074033256
- },
- "hellaswag": {
- "acc": 0.5206134236207927,
- "acc_stderr": 0.004985539159783419,
- "acc_norm": 0.6900019916351324,
- "acc_norm_stderr": 0.004615472210316043
- },
- "rte": {
- "acc": 0.48736462093862815,
- "acc_stderr": 0.030086851767188564
- },
- "winogrande": {
- "acc": 0.6077348066298343,
- "acc_stderr": 0.013722400462000888
- },
- "storycloze_2016": {
- "acc": 0.7509353287012293,
- "acc_stderr": 0.010000841162740146
- },
- "boolq": {
- "acc": 0.636085626911315,
- "acc_stderr": 0.00841491890912884
- },
- "arc_easy": {
- "acc": 0.6632996632996633,
- "acc_stderr": 0.009697166595752475,
- "acc_norm": 0.6447811447811448,
- "acc_norm_stderr": 0.009820245899287124
- },
- "arc_challenge": {
- "acc": 0.3199658703071672,
- "acc_stderr": 0.013631345807016196,
- "acc_norm": 0.3447098976109215,
- "acc_norm_stderr": 0.01388881628678211
- },
- "sciq": {
- "acc": 0.927,
- "acc_stderr": 0.008230354715244062,
- "acc_norm": 0.908,
- "acc_norm_stderr": 0.009144376393151108
- },
- "piqa": {
- "acc": 0.7665941240478781,
- "acc_stderr": 0.009869247889521007,
- "acc_norm": 0.7682263329706203,
- "acc_norm_stderr": 0.00984514377279404
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b13b/evaluation/rankeval/8b7178b13b_5.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.34,0.014987482264363935,0
+ anli_r2,acc,0.337,0.014955087918653605,0
+ anli_r3,acc,0.3325,0.013605417345710526,0
+ arc_challenge,acc,0.3310580204778157,0.013752062419817834,0
+ arc_challenge,acc_norm,0.3447098976109215,0.013888816286782112,0
+ arc_easy,acc,0.6536195286195287,0.009763542075695738,0
+ arc_easy,acc_norm,0.6401515151515151,0.009848484848484836,0
+ boolq,acc,0.6409785932721712,0.008390241754319908,1
+ cb,acc,0.23214285714285715,0.0569293902400011,1
+ cb,f1,0.223351041141572,,1
+ copa,acc,0.76,0.042923469599092816,0
+ hellaswag,acc,0.5209121688906593,0.004985415250690905,0
+ hellaswag,acc_norm,0.689205337582155,0.004618730353217064,0
+ piqa,acc,0.7606093579978237,0.009955884250291688,0
+ piqa,acc_norm,0.7747551686615887,0.009746643471032136,0
+ rte,acc,0.5054151624548736,0.030094698123239966,0
+ sciq,acc,0.923,0.008434580140240648,0
+ sciq,acc_norm,0.912,0.008963053962592074,0
+ storycloze_2016,acc,0.7498663816141101,0.010015143382536456,0
+ winogrande,acc,0.5935280189423836,0.01380444869775337,0
8b7178b13b/evaluation/rankeval/8b7178b13b_5_lm-eval_global_step84877_2023-05-15-10-06-46_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.34,
- "acc_stderr": 0.014987482264363935
- },
- "anli_r2": {
- "acc": 0.337,
- "acc_stderr": 0.014955087918653605
- },
- "anli_r3": {
- "acc": 0.3325,
- "acc_stderr": 0.013605417345710526
- },
- "cb": {
- "acc": 0.23214285714285715,
- "acc_stderr": 0.0569293902400011,
- "f1": 0.223351041141572
- },
- "copa": {
- "acc": 0.76,
- "acc_stderr": 0.042923469599092816
- },
- "hellaswag": {
- "acc": 0.5209121688906593,
- "acc_stderr": 0.004985415250690905,
- "acc_norm": 0.689205337582155,
- "acc_norm_stderr": 0.004618730353217064
- },
- "rte": {
- "acc": 0.5054151624548736,
- "acc_stderr": 0.030094698123239966
- },
- "winogrande": {
- "acc": 0.5935280189423836,
- "acc_stderr": 0.01380444869775337
- },
- "storycloze_2016": {
- "acc": 0.7498663816141101,
- "acc_stderr": 0.010015143382536456
- },
- "boolq": {
- "acc": 0.6409785932721712,
- "acc_stderr": 0.008390241754319908
- },
- "arc_easy": {
- "acc": 0.6536195286195287,
- "acc_stderr": 0.009763542075695738,
- "acc_norm": 0.6401515151515151,
- "acc_norm_stderr": 0.009848484848484836
- },
- "arc_challenge": {
- "acc": 0.3310580204778157,
- "acc_stderr": 0.013752062419817834,
- "acc_norm": 0.3447098976109215,
- "acc_norm_stderr": 0.013888816286782112
- },
- "sciq": {
- "acc": 0.923,
- "acc_stderr": 0.008434580140240648,
- "acc_norm": 0.912,
- "acc_norm_stderr": 0.008963053962592074
- },
- "piqa": {
- "acc": 0.7606093579978237,
- "acc_stderr": 0.009955884250291688,
- "acc_norm": 0.7747551686615887,
- "acc_norm_stderr": 0.009746643471032136
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b13b/transformers/merges.txt ADDED
File without changes
8b7178b13b/transformers/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
8b7178b13b/transformers/vocab.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783
+ size 1042301
8b7178b178b/evaluation/8b7178b178b_0_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.0,
+ "em_stderr": 0.0
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b178b/transformers",
+ "num_fewshot": 0,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b178b/evaluation/8b7178b178b_1_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.0,
+ "em_stderr": 0.0
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b178b/transformers",
+ "num_fewshot": 1,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b178b/evaluation/8b7178b178b_2_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.0,
+ "em_stderr": 0.0
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b178b/transformers",
+ "num_fewshot": 2,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b178b/evaluation/8b7178b178b_3_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.0,
+ "em_stderr": 0.0
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b178b/transformers",
+ "num_fewshot": 3,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b178b/evaluation/8b7178b178b_4_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.0,
+ "em_stderr": 0.0
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b178b/transformers",
+ "num_fewshot": 4,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b178b/evaluation/8b7178b178b_5_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.0,
+ "em_stderr": 0.0
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b178b/transformers",
+ "num_fewshot": 5,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
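The bAbI results for this checkpoint are one JSON per shot count, each reporting a single exact-match score. A small sketch (paths assumed from the file names above; standard library only) for collecting them into one mapping:

import glob
import json

# Hypothetical aggregation over the per-shot bAbI result files for this checkpoint.
scores = {}
for path in sorted(glob.glob("8b7178b178b/evaluation/8b7178b178b_*_babi.json")):
    with open(path) as f:
        result = json.load(f)
    scores[result["config"]["num_fewshot"]] = result["results"]["babi"]["em"]

print(scores)  # every shot count shown above reports em = 0.0 for this checkpoint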