Muennighoff
commited on
Commit
•
b220bd1
1
Parent(s):
b101f59
Eval
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- 8b7178b13b/evaluation/8b7178b13b_0_babi.json +22 -0
- 8b7178b13b/evaluation/8b7178b13b_1_babi.json +22 -0
- 8b7178b13b/evaluation/8b7178b13b_2_babi.json +22 -0
- 8b7178b13b/evaluation/8b7178b13b_3_babi.json +22 -0
- 8b7178b13b/evaluation/8b7178b13b_4_babi.json +22 -0
- 8b7178b13b/evaluation/8b7178b13b_5_babi.json +22 -0
- 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json +1 -0
- 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json +1 -0
- 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json +1 -0
- 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json +1 -0
- 8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_3.json +1 -0
- 8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_4.json +1 -0
- 8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_5.json +1 -0
- 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl +3 -0
- 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl +3 -0
- 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl +3 -0
- 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl +3 -0
- 8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl +3 -0
- 8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl +3 -0
- 8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl +3 -0
- 8b7178b13b/evaluation/generation/merged.csv +53 -0
- 8b7178b13b/evaluation/generation/merged.json +1 -0
- 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json +133 -0
- 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json +133 -0
- 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json +133 -0
- 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json +133 -0
- 8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_3.json +133 -0
- 8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_4.json +133 -0
- 8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_5.json +133 -0
- 8b7178b13b/evaluation/rankeval/8b7178b13b_0.csv +21 -0
- 8b7178b13b/evaluation/rankeval/8b7178b13b_0_lm-eval_global_step84877_2023-05-15-10-07-19_0shots_backup.json +0 -87
- 8b7178b13b/evaluation/rankeval/8b7178b13b_1.csv +21 -0
- 8b7178b13b/evaluation/rankeval/8b7178b13b_1_lm-eval_global_step84877_2023-05-15-10-07-19_1shots_backup.json +0 -87
- 8b7178b13b/evaluation/rankeval/8b7178b13b_2.csv +21 -0
- 8b7178b13b/evaluation/rankeval/8b7178b13b_2_lm-eval_global_step84877_2023-05-15-10-07-19_2shots_backup.json +0 -87
- 8b7178b13b/evaluation/rankeval/8b7178b13b_3.csv +21 -0
- 8b7178b13b/evaluation/rankeval/8b7178b13b_3_lm-eval_global_step84877_2023-05-15-10-07-25_3shots_backup.json +0 -87
- 8b7178b13b/evaluation/rankeval/8b7178b13b_4.csv +21 -0
- 8b7178b13b/evaluation/rankeval/8b7178b13b_4_lm-eval_global_step84877_2023-05-15-10-06-46_4shots_backup.json +0 -87
- 8b7178b13b/evaluation/rankeval/8b7178b13b_5.csv +21 -0
- 8b7178b13b/evaluation/rankeval/8b7178b13b_5_lm-eval_global_step84877_2023-05-15-10-06-46_5shots_backup.json +0 -87
- 8b7178b13b/transformers/merges.txt +0 -0
- 8b7178b13b/transformers/tokenizer.json +0 -0
- 8b7178b13b/transformers/vocab.json +3 -0
- 8b7178b178b/evaluation/8b7178b178b_0_babi.json +22 -0
- 8b7178b178b/evaluation/8b7178b178b_1_babi.json +22 -0
- 8b7178b178b/evaluation/8b7178b178b_2_babi.json +22 -0
- 8b7178b178b/evaluation/8b7178b178b_3_babi.json +22 -0
- 8b7178b178b/evaluation/8b7178b178b_4_babi.json +22 -0
- 8b7178b178b/evaluation/8b7178b178b_5_babi.json +22 -0
8b7178b13b/evaluation/8b7178b13b_0_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.0,
|
5 |
+
"em_stderr": 0.0
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers",
|
14 |
+
"num_fewshot": 0,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b13b/evaluation/8b7178b13b_1_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.0,
|
5 |
+
"em_stderr": 0.0
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers",
|
14 |
+
"num_fewshot": 1,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b13b/evaluation/8b7178b13b_2_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.0,
|
5 |
+
"em_stderr": 0.0
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers",
|
14 |
+
"num_fewshot": 2,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b13b/evaluation/8b7178b13b_3_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.0,
|
5 |
+
"em_stderr": 0.0
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers",
|
14 |
+
"num_fewshot": 3,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b13b/evaluation/8b7178b13b_4_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.0,
|
5 |
+
"em_stderr": 0.0
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers",
|
14 |
+
"num_fewshot": 4,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b13b/evaluation/8b7178b13b_5_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.0,
|
5 |
+
"em_stderr": 0.0
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers",
|
14 |
+
"num_fewshot": 5,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.19198287608229725, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021092410953077365}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.32589953688933776, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002753157025629824}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.22344837173188767, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018933971382747686}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.048137852198349124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010124293758406351}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.08482162860321114, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017481381213950958}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05600464749019431, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001035304930307184}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.13481580311964722, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014071194011638184}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.23733416551629913, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00219502068666546}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.15854970566001372, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012774189115518982}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.18117711700554173, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019909544088953742}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.30882558318305736, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026454321668364913}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.21107970658856343, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017866890110014018}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.013745331324814, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06592229302797037}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.16688833624223554, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024001395431129838}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.27404204769144364, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003375962083519381}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1883894874399809, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022518375274135087}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.04136653299690803, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010174651332227344}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07172155304298297, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017383368313820028}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04708204631655152, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010069752068567668}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.11810699062222797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001714836740267086}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.19959382723616914, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002605476332476844}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13391288059874437, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001564486351392238}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.15789252138792423, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022764600108588808}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.26023963410461465, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003240567155894993}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.17834159419543702, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002133544799751138}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.986263870634216, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0869488541072268}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.05571162128505982, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019790843122430768}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.09490155511879607, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003216109108055619}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06229352605061771, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002059973094043126}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.013746904154847127, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000658207908545434}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.02605352352304377, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013317962968417952}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.01596233535299914, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007332643966253175}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.04028867827357096, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014354449068175981}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.07013160574213247, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024380518058086703}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04502172063067562, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014806563009305842}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.0526446584524354, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018735818695187819}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.08977749314111799, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030541817569144}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05884434373920439, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019486127272992522}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.7653537900871676, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.049323102026813614}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.009989872401580602, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009952828000827124}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.015276156161555617, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001444671711873694}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.010262809104423124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000941433577191994}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0024167502513949756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00033798462070527304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.004213844763908219, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006183240014712593}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0025848086028049965, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003301678198930521}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.007350732687047713, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000736612195363616}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.011512337859684543, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0011311946838375108}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.007530769725575652, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006918159097560924}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.009401823549618816, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009477935776177291}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.014304994804442096, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0013529102153531946}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.009601062566883777, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008823539930029628}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.313122283850011e-06, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 4.438365983452618e-06}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_3.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.14077378985561326, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002167627071465062}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3365881713882994, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004904093832075129}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19482488018157684, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028449440307477085}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03346907677060051, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00115993469618386}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08385926778368435, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0030304591639245464}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04702347182983231, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001623530160402657}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10579758272401656, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016080916023179224}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.25509688514423123, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003826588022233898}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.14665273300314152, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00212580713201847}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11263878026618175, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00176128619109869}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2715816804884895, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004199504898315826}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.15621401458929957, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00234420229722605}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.03791695157546, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1342348737065576}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_4.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.04332722979741195, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025397484135024503}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.08638484320409213, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004855712915740289}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05341124486563334, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029118807193791816}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.009371982127377696, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007761604812612375}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.02197790550576286, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018737167719494205}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0126600494371876, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010393927624132967}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.03308111037957417, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002037483672167313}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.06547067873377617, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037402890017668913}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04031062592059575, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022065055222514843}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.03547102946933422, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002145817702406172}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.07066149952031933, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004048133700252855}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04344922023460101, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002384615926685988}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.9907623400783704, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12144441541342028}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_5.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002432491256372859, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006531802890059347}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0035688165030904754, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.000969634171191206}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0026267053439145405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006851925438686376}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0002462802129838755, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00010248150275091556}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.00045873937383371344, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0002027676865696476}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0003000575858304317, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00012455541465929192}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.001824459823195743, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000493813415204384}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.002706254470313614, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0007319861753860738}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.001963800127619678, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005077215919641312}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.001947339725356528, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005292516418451348}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0028418243363945105, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0007704086076680828}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.002094899399917077, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005500941719652507}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.959625061740418e-19, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.1984703371139495e-15}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4e1dbeb121b2aa6b21362cc27760dfac9df5f0c4392de6dc66b55afa2d2ddcdb
|
3 |
+
size 18917037
|
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9170e4bf6e70cb0474e5cc61a22d09edf230e43fbfc24971c3ca80e1f9303ace
|
3 |
+
size 24337079
|
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7121ccb97fa1421eb5d54fd4c30f575c47407e8e1d22029847bd1bc796605b98
|
3 |
+
size 29478626
|
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5d4f84f2bf12c2c7f4343c7a75d12d4f3b26590608f8e73f6feb6f4760f1adf8
|
3 |
+
size 34801616
|
8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fbbd5b38d4479e236de19fb18b0426fc754c283c908c6551a1c23132a8bec707
|
3 |
+
size 9647191
|
8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:efb257030811a985c3b288620cf2719b5f2f7b4cf73388bcdb5057bd80ab93d9
|
3 |
+
size 11673906
|
8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fc87da448769f33c6956c2aade6eb6ad69e8c8db403d47f83be5244751597f78
|
3 |
+
size 13899064
|
8b7178b13b/evaluation/generation/merged.csv
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.013098749477153882
|
3 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.013098749477153882
|
4 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.17675735102863532
|
5 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.17675735102863532
|
6 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.20791813280930055
|
7 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.20791813280930055
|
8 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2138339925008178
|
9 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2138339925008178
|
10 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.21707781052843164
|
11 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.21707781052843164
|
12 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.21807002700335165
|
13 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.21807002700335165
|
14 |
+
e2e_nlg_cleaned,5,average,multiple,0.17445934389128182
|
15 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.05279448511938101
|
16 |
+
gem_xsum,0,median,rouge2_fmeasure,0.05279448511938101
|
17 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03829439148207006
|
18 |
+
gem_xsum,1,median,rouge2_fmeasure,0.03829439148207006
|
19 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04760922765448965
|
20 |
+
gem_xsum,2,median,rouge2_fmeasure,0.04760922765448965
|
21 |
+
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04702347182983231
|
22 |
+
gem_xsum,3,median,rouge2_fmeasure,0.04702347182983231
|
23 |
+
gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.0126600494371876
|
24 |
+
gem_xsum,4,median,rouge2_fmeasure,0.0126600494371876
|
25 |
+
gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0003000575858304317
|
26 |
+
gem_xsum,5,median,rouge2_fmeasure,0.0003000575858304317
|
27 |
+
gem_xsum,5,average,multiple,0.033113613851465176
|
28 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.051001464150726095
|
29 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.051001464150726095
|
30 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05455324665630359
|
31 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.05455324665630359
|
32 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.055583861631680276
|
33 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.055583861631680276
|
34 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05769919253564089
|
35 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.05769919253564089
|
36 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.06050302495157815
|
37 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.06050302495157815
|
38 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.06252454636994373
|
39 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.06252454636994373
|
40 |
+
web_nlg_en,5,average,multiple,0.05697755604931212
|
41 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04191366659253858
|
42 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.04191366659253858
|
43 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04827166271383884
|
44 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.04827166271383884
|
45 |
+
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05600464749019431
|
46 |
+
wiki_lingua_en,2,median,rouge2_fmeasure,0.05600464749019431
|
47 |
+
wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04708204631655152
|
48 |
+
wiki_lingua_en,3,median,rouge2_fmeasure,0.04708204631655152
|
49 |
+
wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01596233535299914
|
50 |
+
wiki_lingua_en,4,median,rouge2_fmeasure,0.01596233535299914
|
51 |
+
wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0025848086028049965
|
52 |
+
wiki_lingua_en,5,median,rouge2_fmeasure,0.0025848086028049965
|
53 |
+
wiki_lingua_en,5,average,multiple,0.0353031945114879
|
8b7178b13b/evaluation/generation/merged.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4373157998206226, "bleu_stderr": 0.056741486001354216, "rouge1_fmeasure": 0.10947380559420297, "rouge1_fmeasure_stderr": 0.0022511546576554292, "rouge1_precision": 0.07630675677312203, "rouge1_precision_stderr": 0.0021365872076980082, "rouge1_recall": 0.2947830429134024, "rouge1_recall_stderr": 0.0047665492114514684, "rouge2_fmeasure": 0.051001464150726095, "rouge2_fmeasure_stderr": 0.0014146940985046523, "rouge2_precision": 0.03437396040114199, "rouge2_precision_stderr": 0.0011646295650624351, "rouge2_recall": 0.13907462394894776, "rouge2_recall_stderr": 0.0031715068123442996, "rougeL_fmeasure": 0.10417993615217146, "rougeL_fmeasure_stderr": 0.002043414019393663, "rougeL_precision": 0.07227253003813615, "rougeL_precision_stderr": 0.001955466598501427, "rougeL_recall": 0.28336049631237586, "rougeL_recall_stderr": 0.004569573466829386, "rougeLsum_fmeasure": 0.10462419732455051, "rougeLsum_fmeasure_stderr": 0.002115527416565905, "rougeLsum_precision": 0.07290032072974349, "rougeLsum_precision_stderr": 0.002034641349821296, "rougeLsum_recall": 0.2825727583996548, "rougeLsum_recall_stderr": 0.004541613699421234}}, "1": {"PALM_prompt": {"bleu": 0.5816764130686213, "bleu_stderr": 0.054147923323820346, "rouge1_fmeasure": 0.11727270364668571, "rouge1_fmeasure_stderr": 0.0021476260743326565, "rouge1_precision": 0.07783086970109386, "rouge1_precision_stderr": 0.001926390801136041, "rouge1_recall": 0.36711372145643695, "rouge1_recall_stderr": 0.005207087210896133, "rouge2_fmeasure": 0.05455324665630359, "rouge2_fmeasure_stderr": 0.0013665856855669073, "rouge2_precision": 0.035941971570071736, "rouge2_precision_stderr": 0.0011116447277521812, "rouge2_recall": 0.17604536516651662, "rouge2_recall_stderr": 0.0036353372642678756, "rougeL_fmeasure": 0.10871994085882164, "rougeL_fmeasure_stderr": 0.0019153699814912425, "rougeL_precision": 0.07209000868611247, "rougeL_precision_stderr": 0.0017559014471748775, "rougeL_recall": 0.33901675935925657, "rougeL_recall_stderr": 0.004687996190515175, "rougeLsum_fmeasure": 0.11053867463797286, "rougeLsum_fmeasure_stderr": 0.0020173857832705686, "rougeLsum_precision": 0.0735230628158407, "rougeLsum_precision_stderr": 0.0018435905984185187, "rougeLsum_recall": 0.34433610068323095, "rougeLsum_recall_stderr": 0.004792623835429767}}, "2": {"PALM_prompt": {"bleu": 0.6801735381749798, "bleu_stderr": 0.04514863078336761, "rouge1_fmeasure": 0.11972585427910692, "rouge1_fmeasure_stderr": 0.001963983917319172, "rouge1_precision": 0.07666591908388082, "rouge1_precision_stderr": 0.0015425382902699712, "rouge1_recall": 0.40028519820196484, "rouge1_recall_stderr": 0.0053218199498182555, "rouge2_fmeasure": 0.055583861631680276, "rouge2_fmeasure_stderr": 0.0012687258801338563, "rouge2_precision": 0.03562073396147935, "rouge2_precision_stderr": 0.0009954366405729978, "rouge2_recall": 0.19724892000097954, "rouge2_recall_stderr": 0.004019172908229666, "rougeL_fmeasure": 0.10999138993564177, "rougeL_fmeasure_stderr": 0.0017648420950003858, "rougeL_precision": 0.07037367893965144, "rougeL_precision_stderr": 0.0013558489579657599, "rougeL_recall": 0.3648716666842121, "rougeL_recall_stderr": 0.004723267009132289, "rougeLsum_fmeasure": 0.1122905686653009, "rougeLsum_fmeasure_stderr": 0.0018425170251459826, "rougeLsum_precision": 0.07198504559682002, "rougeLsum_precision_stderr": 0.0014532243063151847, "rougeLsum_recall": 0.3737640227903558, "rougeLsum_recall_stderr": 0.004892612299725926}}, "3": {"PALM_prompt": {"bleu": 0.8393015493701177, "bleu_stderr": 0.049868510474836, "rouge1_fmeasure": 0.12332290364001403, "rouge1_fmeasure_stderr": 0.0020213255478620834, "rouge1_precision": 0.07934496701755703, "rouge1_precision_stderr": 0.0017108996646152186, "rouge1_recall": 0.41957311539245606, "rouge1_recall_stderr": 0.005383968460349908, "rouge2_fmeasure": 0.05769919253564089, "rouge2_fmeasure_stderr": 0.001315453385968732, "rouge2_precision": 0.03700401446806371, "rouge2_precision_stderr": 0.0010754080423489075, "rouge2_recall": 0.20785188427731804, "rouge2_recall_stderr": 0.0039850136524860235, "rougeL_fmeasure": 0.11171273283361995, "rougeL_fmeasure_stderr": 0.0017620828691461753, "rougeL_precision": 0.07186075331384528, "rougeL_precision_stderr": 0.0015065682717614625, "rougeL_recall": 0.3792930717112658, "rougeL_recall_stderr": 0.004756348746075166, "rougeLsum_fmeasure": 0.11543602475589927, "rougeLsum_fmeasure_stderr": 0.001888123216509075, "rougeLsum_precision": 0.07439776049493438, "rougeLsum_precision_stderr": 0.0016078356678010679, "rougeLsum_recall": 0.39147213524665503, "rougeLsum_recall_stderr": 0.004968164479358761}}, "4": {"PALM_prompt": {"bleu": 0.8436391155168642, "bleu_stderr": 0.049789928911136136, "rouge1_fmeasure": 0.12951808355992003, "rouge1_fmeasure_stderr": 0.0019403013558157668, "rouge1_precision": 0.08188000384427346, "rouge1_precision_stderr": 0.0015429926759057174, "rouge1_recall": 0.45225212783083296, "rouge1_recall_stderr": 0.005452056426418837, "rouge2_fmeasure": 0.06050302495157815, "rouge2_fmeasure_stderr": 0.0012369724150331927, "rouge2_precision": 0.03789712819393572, "rouge2_precision_stderr": 0.0009152136077591718, "rouge2_recall": 0.22689495723762804, "rouge2_recall_stderr": 0.004067188852506108, "rougeL_fmeasure": 0.11592286526905457, "rougeL_fmeasure_stderr": 0.0016906240203803347, "rougeL_precision": 0.07330729359090057, "rougeL_precision_stderr": 0.0013644419423480823, "rougeL_recall": 0.4045943171397325, "rougeL_recall_stderr": 0.0048320778663538955, "rougeLsum_fmeasure": 0.12060522562383363, "rougeLsum_fmeasure_stderr": 0.0018022149452879743, "rougeLsum_precision": 0.07637812548771711, "rougeLsum_precision_stderr": 0.0014540942217052375, "rougeLsum_recall": 0.4207633762253156, "rougeLsum_recall_stderr": 0.005005614020197891}}, "5": {"PALM_prompt": {"bleu": 0.968801645723095, "bleu_stderr": 0.05462868115593731, "rouge1_fmeasure": 0.13468892691608006, "rouge1_fmeasure_stderr": 0.0019240731337134673, "rouge1_precision": 0.08408541082259362, "rouge1_precision_stderr": 0.001415159971957011, "rouge1_recall": 0.4730441639084742, "rouge1_recall_stderr": 0.005431978662817184, "rouge2_fmeasure": 0.06252454636994373, "rouge2_fmeasure_stderr": 0.0012377179103821956, "rouge2_precision": 0.038803485654806714, "rouge2_precision_stderr": 0.0008788058278028121, "rouge2_recall": 0.2379631167113304, "rouge2_recall_stderr": 0.004152278672512853, "rougeL_fmeasure": 0.11896213494013362, "rougeL_fmeasure_stderr": 0.001650634135835398, "rougeL_precision": 0.07423644311431606, "rougeL_precision_stderr": 0.0012142160071926726, "rougeL_recall": 0.4194644730807224, "rougeL_recall_stderr": 0.0047799777603664715, "rougeLsum_fmeasure": 0.12479160570822044, "rougeLsum_fmeasure_stderr": 0.0017844920178026805, "rougeLsum_precision": 0.07800322392434518, "rougeLsum_precision_stderr": 0.0013202039315876929, "rougeLsum_recall": 0.43851085762444947, "rougeLsum_recall_stderr": 0.004968418620592047}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.8310798418084737, "bleu_stderr": 0.06119305662626742, "rouge1_fmeasure": 0.19188833167040015, "rouge1_fmeasure_stderr": 0.0018702943944022418, "rouge1_precision": 0.1628352267958049, "rouge1_precision_stderr": 0.0019076289743005436, "rouge1_recall": 0.28137034403026634, "rouge1_recall_stderr": 0.002726139405286867, "rouge2_fmeasure": 0.04191366659253858, "rouge2_fmeasure_stderr": 0.000917658619128283, "rouge2_precision": 0.035168122195033014, "rouge2_precision_stderr": 0.0008051014260490183, "rouge2_recall": 0.06371619685684557, "rouge2_recall_stderr": 0.0015314676134498058, "rougeL_fmeasure": 0.14614932555210797, "rougeL_fmeasure_stderr": 0.0012956973040530264, "rougeL_precision": 0.12246239102421737, "rougeL_precision_stderr": 0.001285311648356421, "rougeL_recall": 0.2201424968967575, "rougeL_recall_stderr": 0.0022006617198564397, "rougeLsum_fmeasure": 0.17662543049894286, "rougeLsum_fmeasure_stderr": 0.001716560474974557, "rougeLsum_precision": 0.14960133985735732, "rougeLsum_precision_stderr": 0.0017445585328390875, "rougeLsum_recall": 0.2600296050390341, "rougeLsum_recall_stderr": 0.002552101400202146}}, "1": {"tldr_en": {"bleu": 2.463691563435929, "bleu_stderr": 0.056238669871763534, "rouge1_fmeasure": 0.20607218590152535, "rouge1_fmeasure_stderr": 0.0019348192676270253, "rouge1_precision": 0.17507251962841255, "rouge1_precision_stderr": 0.0019979193178610104, "rouge1_recall": 0.3008600636001003, "rouge1_recall_stderr": 0.002760929751246215, "rouge2_fmeasure": 0.04827166271383884, "rouge2_fmeasure_stderr": 0.000981638580890638, "rouge2_precision": 0.040814840621502924, "rouge2_precision_stderr": 0.0008769075552676463, "rouge2_recall": 0.07239534384858555, "rouge2_recall_stderr": 0.0015745819458817655, "rougeL_fmeasure": 0.14993882097099004, "rougeL_fmeasure_stderr": 0.0013143528095662098, "rougeL_precision": 0.12604513290190442, "rougeL_precision_stderr": 0.001331288983523961, "rougeL_recall": 0.22410236240350495, "rougeL_recall_stderr": 0.0021252698749924166, "rougeLsum_fmeasure": 0.19267274247494454, "rougeLsum_fmeasure_stderr": 0.0018042087124662393, "rougeLsum_precision": 0.1634506211607229, "rougeLsum_precision_stderr": 0.0018573519091366366, "rougeLsum_recall": 0.282240921650429, "rougeLsum_recall_stderr": 0.002619133620350951}}, "2": {"tldr_en": {"bleu": 3.013745331324814, "bleu_stderr": 0.06592229302797037, "rouge1_fmeasure": 0.22344837173188767, "rouge1_fmeasure_stderr": 0.0018933971382747686, "rouge1_precision": 0.19198287608229725, "rouge1_precision_stderr": 0.0021092410953077365, "rouge1_recall": 0.32589953688933776, "rouge1_recall_stderr": 0.002753157025629824, "rouge2_fmeasure": 0.05600464749019431, "rouge2_fmeasure_stderr": 0.001035304930307184, "rouge2_precision": 0.048137852198349124, "rouge2_precision_stderr": 0.0010124293758406351, "rouge2_recall": 0.08482162860321114, "rouge2_recall_stderr": 0.0017481381213950958, "rougeL_fmeasure": 0.15854970566001372, "rougeL_fmeasure_stderr": 0.0012774189115518982, "rougeL_precision": 0.13481580311964722, "rougeL_precision_stderr": 0.0014071194011638184, "rougeL_recall": 0.23733416551629913, "rougeL_recall_stderr": 0.00219502068666546, "rougeLsum_fmeasure": 0.21107970658856343, "rougeLsum_fmeasure_stderr": 0.0017866890110014018, "rougeLsum_precision": 0.18117711700554173, "rougeLsum_precision_stderr": 0.0019909544088953742, "rougeLsum_recall": 0.30882558318305736, "rougeLsum_recall_stderr": 0.0026454321668364913}}, "3": {"tldr_en": {"bleu": 2.986263870634216, "bleu_stderr": 0.0869488541072268, "rouge1_fmeasure": 0.1883894874399809, "rouge1_fmeasure_stderr": 0.0022518375274135087, "rouge1_precision": 0.16688833624223554, "rouge1_precision_stderr": 0.0024001395431129838, "rouge1_recall": 0.27404204769144364, "rouge1_recall_stderr": 0.003375962083519381, "rouge2_fmeasure": 0.04708204631655152, "rouge2_fmeasure_stderr": 0.0010069752068567668, "rouge2_precision": 0.04136653299690803, "rouge2_precision_stderr": 0.0010174651332227344, "rouge2_recall": 0.07172155304298297, "rouge2_recall_stderr": 0.0017383368313820028, "rougeL_fmeasure": 0.13391288059874437, "rougeL_fmeasure_stderr": 0.001564486351392238, "rougeL_precision": 0.11810699062222797, "rougeL_precision_stderr": 0.001714836740267086, "rougeL_recall": 0.19959382723616914, "rougeL_recall_stderr": 0.002605476332476844, "rougeLsum_fmeasure": 0.17834159419543702, "rougeLsum_fmeasure_stderr": 0.002133544799751138, "rougeLsum_precision": 0.15789252138792423, "rougeLsum_precision_stderr": 0.0022764600108588808, "rougeLsum_recall": 0.26023963410461465, "rougeLsum_recall_stderr": 0.003240567155894993}}, "4": {"tldr_en": {"bleu": 0.7653537900871676, "bleu_stderr": 0.049323102026813614, "rouge1_fmeasure": 0.06229352605061771, "rouge1_fmeasure_stderr": 0.002059973094043126, "rouge1_precision": 0.05571162128505982, "rouge1_precision_stderr": 0.0019790843122430768, "rouge1_recall": 0.09490155511879607, "rouge1_recall_stderr": 0.003216109108055619, "rouge2_fmeasure": 0.01596233535299914, "rouge2_fmeasure_stderr": 0.0007332643966253175, "rouge2_precision": 0.013746904154847127, "rouge2_precision_stderr": 0.000658207908545434, "rouge2_recall": 0.02605352352304377, "rouge2_recall_stderr": 0.0013317962968417952, "rougeL_fmeasure": 0.04502172063067562, "rougeL_fmeasure_stderr": 0.0014806563009305842, "rougeL_precision": 0.04028867827357096, "rougeL_precision_stderr": 0.0014354449068175981, "rougeL_recall": 0.07013160574213247, "rougeL_recall_stderr": 0.0024380518058086703, "rougeLsum_fmeasure": 0.05884434373920439, "rougeLsum_fmeasure_stderr": 0.0019486127272992522, "rougeLsum_precision": 0.0526446584524354, "rougeLsum_precision_stderr": 0.0018735818695187819, "rougeLsum_recall": 0.08977749314111799, "rougeLsum_recall_stderr": 0.0030541817569144}}, "5": {"tldr_en": {"bleu": 2.313122283850011e-06, "bleu_stderr": 4.438365983452618e-06, "rouge1_fmeasure": 0.010262809104423124, "rouge1_fmeasure_stderr": 0.000941433577191994, "rouge1_precision": 0.009989872401580602, "rouge1_precision_stderr": 0.0009952828000827124, "rouge1_recall": 0.015276156161555617, "rouge1_recall_stderr": 0.001444671711873694, "rouge2_fmeasure": 0.0025848086028049965, "rouge2_fmeasure_stderr": 0.0003301678198930521, "rouge2_precision": 0.0024167502513949756, "rouge2_precision_stderr": 0.00033798462070527304, "rouge2_recall": 0.004213844763908219, "rouge2_recall_stderr": 0.0006183240014712593, "rougeL_fmeasure": 0.007530769725575652, "rougeL_fmeasure_stderr": 0.0006918159097560924, "rougeL_precision": 0.007350732687047713, "rougeL_precision_stderr": 0.000736612195363616, "rougeL_recall": 0.011512337859684543, "rougeL_recall_stderr": 0.0011311946838375108, "rougeLsum_fmeasure": 0.009601062566883777, "rougeLsum_fmeasure_stderr": 0.0008823539930029628, "rougeLsum_precision": 0.009401823549618816, "rougeLsum_precision_stderr": 0.0009477935776177291, "rougeLsum_recall": 0.014304994804442096, "rougeLsum_recall_stderr": 0.0013529102153531946}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.3016960813220789, "bleu_stderr": 0.04082243698870125, "rouge1_fmeasure": 0.08446489060239885, "rouge1_fmeasure_stderr": 0.000832880994211257, "rouge1_precision": 0.0628362517057776, "rouge1_precision_stderr": 0.0006647851313912593, "rouge1_recall": 0.137799699754724, "rouge1_recall_stderr": 0.0013399199497499206, "rouge2_fmeasure": 0.013098749477153882, "rouge2_fmeasure_stderr": 0.00036393407871382073, "rouge2_precision": 0.009897778438587298, "rouge2_precision_stderr": 0.0002813167869752497, "rouge2_recall": 0.02030644827526888, "rouge2_recall_stderr": 0.0005626168002295672, "rougeL_fmeasure": 0.08185345358329085, "rougeL_fmeasure_stderr": 0.0007758077715250189, "rougeL_precision": 0.060894085509621707, "rougeL_precision_stderr": 0.0006165662598640652, "rougeL_recall": 0.1334600824136164, "rougeL_recall_stderr": 0.0012622226683087598, "rougeLsum_fmeasure": 0.07821933484300712, "rougeLsum_fmeasure_stderr": 0.000742258462877918, "rougeLsum_precision": 0.05822489106803366, "rougeLsum_precision_stderr": 0.00059732648786741, "rougeLsum_recall": 0.12759747263850774, "rougeLsum_recall_stderr": 0.0011920315451119745}}, "1": {"generate_text_restaurant": {"bleu": 9.014911506469076, "bleu_stderr": 0.10639404635916568, "rouge1_fmeasure": 0.4071860181343587, "rouge1_fmeasure_stderr": 0.002172401493926826, "rouge1_precision": 0.4220547738164344, "rouge1_precision_stderr": 0.0029747898853502135, "rouge1_recall": 0.44890682882262095, "rouge1_recall_stderr": 0.0030482256607117985, "rouge2_fmeasure": 0.17675735102863532, "rouge2_fmeasure_stderr": 0.0016983524897307275, "rouge2_precision": 0.18398717679998686, "rouge2_precision_stderr": 0.002070314538395242, "rouge2_recall": 0.19695708046780447, "rouge2_recall_stderr": 0.0021253468809232486, "rougeL_fmeasure": 0.2901955244768342, "rougeL_fmeasure_stderr": 0.0017445755551563752, "rougeL_precision": 0.301979337271323, "rougeL_precision_stderr": 0.002426170411020153, "rougeL_recall": 0.32120889976820555, "rougeL_recall_stderr": 0.0024556761259021593, "rougeLsum_fmeasure": 0.3366538075182426, "rougeLsum_fmeasure_stderr": 0.0020840116286440125, "rougeLsum_precision": 0.3498631569304881, "rougeLsum_precision_stderr": 0.00275693533807641, "rougeLsum_recall": 0.3704033862486897, "rougeLsum_recall_stderr": 0.00277359280896307}}, "2": {"generate_text_restaurant": {"bleu": 11.1355912496663, "bleu_stderr": 0.13104161087327487, "rouge1_fmeasure": 0.4449026725503146, "rouge1_fmeasure_stderr": 0.0020303864410380614, "rouge1_precision": 0.45419518183344554, "rouge1_precision_stderr": 0.0027266345121146297, "rouge1_recall": 0.48044140772752547, "rouge1_recall_stderr": 0.002817950150203246, "rouge2_fmeasure": 0.20791813280930055, "rouge2_fmeasure_stderr": 0.0017732074653438918, "rouge2_precision": 0.2124884501715128, "rouge2_precision_stderr": 0.0020421137596173576, "rouge2_recall": 0.22663601203825093, "rouge2_recall_stderr": 0.0021799430214164035, "rougeL_fmeasure": 0.32200623778436555, "rougeL_fmeasure_stderr": 0.0017788348099423016, "rougeL_precision": 0.32873823752244813, "rougeL_precision_stderr": 0.002262430136729236, "rougeL_recall": 0.34880236417169236, "rougeL_recall_stderr": 0.0024064753259342344, "rougeLsum_fmeasure": 0.37259609212766276, "rougeLsum_fmeasure_stderr": 0.0020305961255707534, "rougeLsum_precision": 0.38047010773438, "rougeLsum_precision_stderr": 0.002559842275120574, "rougeLsum_recall": 0.40224234667337105, "rougeLsum_recall_stderr": 0.002660545444016772}}, "3": {"generate_text_restaurant": {"bleu": 12.10470203084112, "bleu_stderr": 0.160895558301053, "rouge1_fmeasure": 0.4492382386896765, "rouge1_fmeasure_stderr": 0.0019699301602619833, "rouge1_precision": 0.45456971607102353, "rouge1_precision_stderr": 0.0023960763531178423, "rouge1_recall": 0.47953391372338766, "rouge1_recall_stderr": 0.0027865520920584822, "rouge2_fmeasure": 0.2138339925008178, "rouge2_fmeasure_stderr": 0.0018445417129634963, "rouge2_precision": 0.2163784140584202, "rouge2_precision_stderr": 0.001980036007401307, "rouge2_recall": 0.23028758666497792, "rouge2_recall_stderr": 0.0022756293438486903, "rougeL_fmeasure": 0.3303919839118561, "rougeL_fmeasure_stderr": 0.001783388701966232, "rougeL_precision": 0.33426681511660167, "rougeL_precision_stderr": 0.002069523940604892, "rougeL_recall": 0.35354783483337526, "rougeL_recall_stderr": 0.0024228991972543336, "rougeLsum_fmeasure": 0.378239342272828, "rougeLsum_fmeasure_stderr": 0.002043462221499771, "rougeLsum_precision": 0.3827360863233357, "rougeLsum_precision_stderr": 0.002359678766192712, "rougeLsum_recall": 0.40399025031746394, "rougeLsum_recall_stderr": 0.002710450542079631}}, "4": {"generate_text_restaurant": {"bleu": 12.44085622955195, "bleu_stderr": 0.13067375364343567, "rouge1_fmeasure": 0.4511875093908521, "rouge1_fmeasure_stderr": 0.001948885960105966, "rouge1_precision": 0.45479182860495637, "rouge1_precision_stderr": 0.0023107278825576553, "rouge1_recall": 0.48017725960351876, "rouge1_recall_stderr": 0.0027316537664281945, "rouge2_fmeasure": 0.21707781052843164, "rouge2_fmeasure_stderr": 0.00185164014349205, "rouge2_precision": 0.21833473257724795, "rouge2_precision_stderr": 0.0019502917664114363, "rouge2_recall": 0.23333710855434647, "rouge2_recall_stderr": 0.002269432186215575, "rougeL_fmeasure": 0.33434874026181494, "rougeL_fmeasure_stderr": 0.0017786642192651214, "rougeL_precision": 0.3366370287941819, "rougeL_precision_stderr": 0.0019965287497770777, "rougeL_recall": 0.35684028344910274, "rougeL_recall_stderr": 0.0024057773390550254, "rougeLsum_fmeasure": 0.38297835064075053, "rougeLsum_fmeasure_stderr": 0.0020438066934495387, "rougeLsum_precision": 0.38571523251054607, "rougeLsum_precision_stderr": 0.0022865314323742805, "rougeLsum_recall": 0.40803304829345277, "rougeLsum_recall_stderr": 0.0027038368009947755}}, "5": {"generate_text_restaurant": {"bleu": 12.352678257477072, "bleu_stderr": 0.1229858870702429, "rouge1_fmeasure": 0.4520495564262539, "rouge1_fmeasure_stderr": 0.0019091091728356577, "rouge1_precision": 0.4536679355694742, "rouge1_precision_stderr": 0.002282805690194143, "rouge1_recall": 0.48257839264210245, "rouge1_recall_stderr": 0.002694452362144022, "rouge2_fmeasure": 0.21807002700335165, "rouge2_fmeasure_stderr": 0.0018243298667958761, "rouge2_precision": 0.2184937204533077, "rouge2_precision_stderr": 0.0019084340007320115, "rouge2_recall": 0.2348008779716612, "rouge2_recall_stderr": 0.0022303214323899125, "rougeL_fmeasure": 0.336313193439231, "rougeL_fmeasure_stderr": 0.001791714678974818, "rougeL_precision": 0.33692930278278704, "rougeL_precision_stderr": 0.0019923203298548866, "rougeL_recall": 0.36002100624003847, "rougeL_recall_stderr": 0.0024053149539646364, "rougeLsum_fmeasure": 0.38490335194628533, "rougeLsum_fmeasure_stderr": 0.002018489316229454, "rougeLsum_precision": 0.38599958036488224, "rougeLsum_precision_stderr": 0.002272531904720447, "rougeLsum_recall": 0.4113378384062774, "rougeLsum_recall_stderr": 0.002667546056299814}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.194933977793404, "bleu_stderr": 0.09650169688197711, "rouge1_fmeasure": 0.2157374708631877, "rouge1_fmeasure_stderr": 0.002508783327476019, "rouge1_precision": 0.15954668519186138, "rouge1_precision_stderr": 0.002100617390256336, "rouge1_recall": 0.36309739763041776, "rouge1_recall_stderr": 0.004368615445871372, "rouge2_fmeasure": 0.05279448511938101, "rouge2_fmeasure_stderr": 0.001664044690700437, "rouge2_precision": 0.03857312620535916, "rouge2_precision_stderr": 0.0012905906114606838, "rouge2_recall": 0.09177566784631346, "rouge2_recall_stderr": 0.0029866334470645636, "rougeL_fmeasure": 0.1604319788556621, "rougeL_fmeasure_stderr": 0.0019556284658093183, "rougeL_precision": 0.11839547499241576, "rougeL_precision_stderr": 0.001646949604960197, "rougeL_recall": 0.27198898577078495, "rougeL_recall_stderr": 0.0035222426961979128, "rougeLsum_fmeasure": 0.1697976651408494, "rougeLsum_fmeasure_stderr": 0.002135233558113895, "rougeLsum_precision": 0.1250746587410693, "rougeLsum_precision_stderr": 0.0017464873888176696, "rougeLsum_recall": 0.28828909491111, "rougeLsum_recall_stderr": 0.0038785736556512756}}, "1": {"article_DOC_summary": {"bleu": 1.638275312642142, "bleu_stderr": 0.11197811159644402, "rouge1_fmeasure": 0.18261422772647087, "rouge1_fmeasure_stderr": 0.00258517997013667, "rouge1_precision": 0.130085376048313, "rouge1_precision_stderr": 0.0019273226219467348, "rouge1_recall": 0.31899443410148204, "rouge1_recall_stderr": 0.004375864955378675, "rouge2_fmeasure": 0.03829439148207006, "rouge2_fmeasure_stderr": 0.0015268296315399182, "rouge2_precision": 0.027035766317831913, "rouge2_precision_stderr": 0.0010834329723279196, "rouge2_recall": 0.06843246331973844, "rouge2_recall_stderr": 0.002783755254166952, "rougeL_fmeasure": 0.13872448607191418, "rougeL_fmeasure_stderr": 0.0019214547149178605, "rougeL_precision": 0.09860861520075329, "rougeL_precision_stderr": 0.0014180838773530536, "rougeL_recall": 0.2439616017337858, "rougeL_recall_stderr": 0.003388978319546548, "rougeLsum_fmeasure": 0.14673205236069722, "rougeLsum_fmeasure_stderr": 0.002156691093638724, "rougeLsum_precision": 0.10431817474650688, "rougeLsum_precision_stderr": 0.0015880489605249845, "rougeLsum_recall": 0.25778306821268704, "rougeLsum_recall_stderr": 0.0037812899736781526}}, "2": {"article_DOC_summary": {"bleu": 2.0210914476360786, "bleu_stderr": 0.12454216962651855, "rouge1_fmeasure": 0.1978001315070318, "rouge1_fmeasure_stderr": 0.002669936318830053, "rouge1_precision": 0.14059986141007405, "rouge1_precision_stderr": 0.001980010991959053, "rouge1_recall": 0.34704590818514364, "rouge1_recall_stderr": 0.004609783640663682, "rouge2_fmeasure": 0.04760922765448965, "rouge2_fmeasure_stderr": 0.001702755511678876, "rouge2_precision": 0.033448694107769246, "rouge2_precision_stderr": 0.0011984145262489562, "rouge2_recall": 0.08605872654054372, "rouge2_recall_stderr": 0.003191665944600775, "rougeL_fmeasure": 0.1511855886298728, "rougeL_fmeasure_stderr": 0.0020261890172370246, "rougeL_precision": 0.10721276455620249, "rougeL_precision_stderr": 0.0014796729903083395, "rougeL_recall": 0.26727650609085896, "rougeL_recall_stderr": 0.0036995069904560843, "rougeLsum_fmeasure": 0.16004925294103853, "rougeLsum_fmeasure_stderr": 0.0022364087298009884, "rougeLsum_precision": 0.11346372550782761, "rougeLsum_precision_stderr": 0.0016303825401644061, "rougeLsum_recall": 0.28288787023938955, "rougeLsum_recall_stderr": 0.004038388892533831}}, "3": {"article_DOC_summary": {"bleu": 2.03791695157546, "bleu_stderr": 0.1342348737065576, "rouge1_fmeasure": 0.19482488018157684, "rouge1_fmeasure_stderr": 0.0028449440307477085, "rouge1_precision": 0.14077378985561326, "rouge1_precision_stderr": 0.002167627071465062, "rouge1_recall": 0.3365881713882994, "rouge1_recall_stderr": 0.004904093832075129, "rouge2_fmeasure": 0.04702347182983231, "rouge2_fmeasure_stderr": 0.001623530160402657, "rouge2_precision": 0.03346907677060051, "rouge2_precision_stderr": 0.00115993469618386, "rouge2_recall": 0.08385926778368435, "rouge2_recall_stderr": 0.0030304591639245464, "rougeL_fmeasure": 0.14665273300314152, "rougeL_fmeasure_stderr": 0.00212580713201847, "rougeL_precision": 0.10579758272401656, "rougeL_precision_stderr": 0.0016080916023179224, "rougeL_recall": 0.25509688514423123, "rougeL_recall_stderr": 0.003826588022233898, "rougeLsum_fmeasure": 0.15621401458929957, "rougeLsum_fmeasure_stderr": 0.00234420229722605, "rougeLsum_precision": 0.11263878026618175, "rougeLsum_precision_stderr": 0.00176128619109869, "rougeLsum_recall": 0.2715816804884895, "rougeLsum_recall_stderr": 0.004199504898315826}}, "4": {"article_DOC_summary": {"bleu": 0.9907623400783704, "bleu_stderr": 0.12144441541342028, "rouge1_fmeasure": 0.05341124486563334, "rouge1_fmeasure_stderr": 0.0029118807193791816, "rouge1_precision": 0.04332722979741195, "rouge1_precision_stderr": 0.0025397484135024503, "rouge1_recall": 0.08638484320409213, "rouge1_recall_stderr": 0.004855712915740289, "rouge2_fmeasure": 0.0126600494371876, "rouge2_fmeasure_stderr": 0.0010393927624132967, "rouge2_precision": 0.009371982127377696, "rouge2_precision_stderr": 0.0007761604812612375, "rouge2_recall": 0.02197790550576286, "rouge2_recall_stderr": 0.0018737167719494205, "rougeL_fmeasure": 0.04031062592059575, "rougeL_fmeasure_stderr": 0.0022065055222514843, "rougeL_precision": 0.03308111037957417, "rougeL_precision_stderr": 0.002037483672167313, "rougeL_recall": 0.06547067873377617, "rougeL_recall_stderr": 0.0037402890017668913, "rougeLsum_fmeasure": 0.04344922023460101, "rougeLsum_fmeasure_stderr": 0.002384615926685988, "rougeLsum_precision": 0.03547102946933422, "rougeLsum_precision_stderr": 0.002145817702406172, "rougeLsum_recall": 0.07066149952031933, "rougeLsum_recall_stderr": 0.004048133700252855}}, "5": {"article_DOC_summary": {"bleu": 1.959625061740418e-19, "bleu_stderr": 1.1984703371139495e-15, "rouge1_fmeasure": 0.0026267053439145405, "rouge1_fmeasure_stderr": 0.0006851925438686376, "rouge1_precision": 0.002432491256372859, "rouge1_precision_stderr": 0.0006531802890059347, "rouge1_recall": 0.0035688165030904754, "rouge1_recall_stderr": 0.000969634171191206, "rouge2_fmeasure": 0.0003000575858304317, "rouge2_fmeasure_stderr": 0.00012455541465929192, "rouge2_precision": 0.0002462802129838755, "rouge2_precision_stderr": 0.00010248150275091556, "rouge2_recall": 0.00045873937383371344, "rouge2_recall_stderr": 0.0002027676865696476, "rougeL_fmeasure": 0.001963800127619678, "rougeL_fmeasure_stderr": 0.0005077215919641312, "rougeL_precision": 0.001824459823195743, "rougeL_precision_stderr": 0.000493813415204384, "rougeL_recall": 0.002706254470313614, "rougeL_recall_stderr": 0.0007319861753860738, "rougeLsum_fmeasure": 0.002094899399917077, "rougeLsum_fmeasure_stderr": 0.0005500941719652507, "rougeLsum_precision": 0.001947339725356528, "rougeLsum_precision_stderr": 0.0005292516418451348, "rougeLsum_recall": 0.0028418243363945105, "rougeLsum_recall_stderr": 0.0007704086076680828}}}}
|
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": [
|
3 |
+
{
|
4 |
+
"task_name": "GEM/wiki_lingua_en",
|
5 |
+
"prompt_name": "tldr_en",
|
6 |
+
"rouge1_precision": 0.19198287608229725,
|
7 |
+
"dataset_path": "GEM/wiki_lingua",
|
8 |
+
"dataset_name": "en",
|
9 |
+
"subset": null,
|
10 |
+
"rouge1_precision_stderr": 0.0021092410953077365
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"task_name": "GEM/wiki_lingua_en",
|
14 |
+
"prompt_name": "tldr_en",
|
15 |
+
"rouge1_recall": 0.32589953688933776,
|
16 |
+
"dataset_path": "GEM/wiki_lingua",
|
17 |
+
"dataset_name": "en",
|
18 |
+
"subset": null,
|
19 |
+
"rouge1_recall_stderr": 0.002753157025629824
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"task_name": "GEM/wiki_lingua_en",
|
23 |
+
"prompt_name": "tldr_en",
|
24 |
+
"rouge1_fmeasure": 0.22344837173188767,
|
25 |
+
"dataset_path": "GEM/wiki_lingua",
|
26 |
+
"dataset_name": "en",
|
27 |
+
"subset": null,
|
28 |
+
"rouge1_fmeasure_stderr": 0.0018933971382747686
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"task_name": "GEM/wiki_lingua_en",
|
32 |
+
"prompt_name": "tldr_en",
|
33 |
+
"rouge2_precision": 0.048137852198349124,
|
34 |
+
"dataset_path": "GEM/wiki_lingua",
|
35 |
+
"dataset_name": "en",
|
36 |
+
"subset": null,
|
37 |
+
"rouge2_precision_stderr": 0.0010124293758406351
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"task_name": "GEM/wiki_lingua_en",
|
41 |
+
"prompt_name": "tldr_en",
|
42 |
+
"rouge2_recall": 0.08482162860321114,
|
43 |
+
"dataset_path": "GEM/wiki_lingua",
|
44 |
+
"dataset_name": "en",
|
45 |
+
"subset": null,
|
46 |
+
"rouge2_recall_stderr": 0.0017481381213950958
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"task_name": "GEM/wiki_lingua_en",
|
50 |
+
"prompt_name": "tldr_en",
|
51 |
+
"rouge2_fmeasure": 0.05600464749019431,
|
52 |
+
"dataset_path": "GEM/wiki_lingua",
|
53 |
+
"dataset_name": "en",
|
54 |
+
"subset": null,
|
55 |
+
"rouge2_fmeasure_stderr": 0.001035304930307184
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"task_name": "GEM/wiki_lingua_en",
|
59 |
+
"prompt_name": "tldr_en",
|
60 |
+
"rougeL_precision": 0.13481580311964722,
|
61 |
+
"dataset_path": "GEM/wiki_lingua",
|
62 |
+
"dataset_name": "en",
|
63 |
+
"subset": null,
|
64 |
+
"rougeL_precision_stderr": 0.0014071194011638184
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"task_name": "GEM/wiki_lingua_en",
|
68 |
+
"prompt_name": "tldr_en",
|
69 |
+
"rougeL_recall": 0.23733416551629913,
|
70 |
+
"dataset_path": "GEM/wiki_lingua",
|
71 |
+
"dataset_name": "en",
|
72 |
+
"subset": null,
|
73 |
+
"rougeL_recall_stderr": 0.00219502068666546
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"task_name": "GEM/wiki_lingua_en",
|
77 |
+
"prompt_name": "tldr_en",
|
78 |
+
"rougeL_fmeasure": 0.15854970566001372,
|
79 |
+
"dataset_path": "GEM/wiki_lingua",
|
80 |
+
"dataset_name": "en",
|
81 |
+
"subset": null,
|
82 |
+
"rougeL_fmeasure_stderr": 0.0012774189115518982
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"task_name": "GEM/wiki_lingua_en",
|
86 |
+
"prompt_name": "tldr_en",
|
87 |
+
"rougeLsum_precision": 0.18117711700554173,
|
88 |
+
"dataset_path": "GEM/wiki_lingua",
|
89 |
+
"dataset_name": "en",
|
90 |
+
"subset": null,
|
91 |
+
"rougeLsum_precision_stderr": 0.0019909544088953742
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"task_name": "GEM/wiki_lingua_en",
|
95 |
+
"prompt_name": "tldr_en",
|
96 |
+
"rougeLsum_recall": 0.30882558318305736,
|
97 |
+
"dataset_path": "GEM/wiki_lingua",
|
98 |
+
"dataset_name": "en",
|
99 |
+
"subset": null,
|
100 |
+
"rougeLsum_recall_stderr": 0.0026454321668364913
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"task_name": "GEM/wiki_lingua_en",
|
104 |
+
"prompt_name": "tldr_en",
|
105 |
+
"rougeLsum_fmeasure": 0.21107970658856343,
|
106 |
+
"dataset_path": "GEM/wiki_lingua",
|
107 |
+
"dataset_name": "en",
|
108 |
+
"subset": null,
|
109 |
+
"rougeLsum_fmeasure_stderr": 0.0017866890110014018
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"task_name": "GEM/wiki_lingua_en",
|
113 |
+
"prompt_name": "tldr_en",
|
114 |
+
"bleu": 3.013745331324814,
|
115 |
+
"dataset_path": "GEM/wiki_lingua",
|
116 |
+
"dataset_name": "en",
|
117 |
+
"subset": null,
|
118 |
+
"bleu_stderr": 0.06592229302797037
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"config": {
|
122 |
+
"model": "hf-causal",
|
123 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
|
124 |
+
"task_args": "",
|
125 |
+
"num_fewshot": 2,
|
126 |
+
"batch_size": 8,
|
127 |
+
"device": "cuda",
|
128 |
+
"use_cache": false,
|
129 |
+
"limit": 3000,
|
130 |
+
"bootstrap_iters": 10,
|
131 |
+
"seed": 1234
|
132 |
+
}
|
133 |
+
}
|
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": [
|
3 |
+
{
|
4 |
+
"task_name": "GEM/wiki_lingua_en",
|
5 |
+
"prompt_name": "tldr_en",
|
6 |
+
"rouge1_precision": 0.16688833624223554,
|
7 |
+
"dataset_path": "GEM/wiki_lingua",
|
8 |
+
"dataset_name": "en",
|
9 |
+
"subset": null,
|
10 |
+
"rouge1_precision_stderr": 0.0024001395431129838
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"task_name": "GEM/wiki_lingua_en",
|
14 |
+
"prompt_name": "tldr_en",
|
15 |
+
"rouge1_recall": 0.27404204769144364,
|
16 |
+
"dataset_path": "GEM/wiki_lingua",
|
17 |
+
"dataset_name": "en",
|
18 |
+
"subset": null,
|
19 |
+
"rouge1_recall_stderr": 0.003375962083519381
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"task_name": "GEM/wiki_lingua_en",
|
23 |
+
"prompt_name": "tldr_en",
|
24 |
+
"rouge1_fmeasure": 0.1883894874399809,
|
25 |
+
"dataset_path": "GEM/wiki_lingua",
|
26 |
+
"dataset_name": "en",
|
27 |
+
"subset": null,
|
28 |
+
"rouge1_fmeasure_stderr": 0.0022518375274135087
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"task_name": "GEM/wiki_lingua_en",
|
32 |
+
"prompt_name": "tldr_en",
|
33 |
+
"rouge2_precision": 0.04136653299690803,
|
34 |
+
"dataset_path": "GEM/wiki_lingua",
|
35 |
+
"dataset_name": "en",
|
36 |
+
"subset": null,
|
37 |
+
"rouge2_precision_stderr": 0.0010174651332227344
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"task_name": "GEM/wiki_lingua_en",
|
41 |
+
"prompt_name": "tldr_en",
|
42 |
+
"rouge2_recall": 0.07172155304298297,
|
43 |
+
"dataset_path": "GEM/wiki_lingua",
|
44 |
+
"dataset_name": "en",
|
45 |
+
"subset": null,
|
46 |
+
"rouge2_recall_stderr": 0.0017383368313820028
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"task_name": "GEM/wiki_lingua_en",
|
50 |
+
"prompt_name": "tldr_en",
|
51 |
+
"rouge2_fmeasure": 0.04708204631655152,
|
52 |
+
"dataset_path": "GEM/wiki_lingua",
|
53 |
+
"dataset_name": "en",
|
54 |
+
"subset": null,
|
55 |
+
"rouge2_fmeasure_stderr": 0.0010069752068567668
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"task_name": "GEM/wiki_lingua_en",
|
59 |
+
"prompt_name": "tldr_en",
|
60 |
+
"rougeL_precision": 0.11810699062222797,
|
61 |
+
"dataset_path": "GEM/wiki_lingua",
|
62 |
+
"dataset_name": "en",
|
63 |
+
"subset": null,
|
64 |
+
"rougeL_precision_stderr": 0.001714836740267086
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"task_name": "GEM/wiki_lingua_en",
|
68 |
+
"prompt_name": "tldr_en",
|
69 |
+
"rougeL_recall": 0.19959382723616914,
|
70 |
+
"dataset_path": "GEM/wiki_lingua",
|
71 |
+
"dataset_name": "en",
|
72 |
+
"subset": null,
|
73 |
+
"rougeL_recall_stderr": 0.002605476332476844
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"task_name": "GEM/wiki_lingua_en",
|
77 |
+
"prompt_name": "tldr_en",
|
78 |
+
"rougeL_fmeasure": 0.13391288059874437,
|
79 |
+
"dataset_path": "GEM/wiki_lingua",
|
80 |
+
"dataset_name": "en",
|
81 |
+
"subset": null,
|
82 |
+
"rougeL_fmeasure_stderr": 0.001564486351392238
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"task_name": "GEM/wiki_lingua_en",
|
86 |
+
"prompt_name": "tldr_en",
|
87 |
+
"rougeLsum_precision": 0.15789252138792423,
|
88 |
+
"dataset_path": "GEM/wiki_lingua",
|
89 |
+
"dataset_name": "en",
|
90 |
+
"subset": null,
|
91 |
+
"rougeLsum_precision_stderr": 0.0022764600108588808
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"task_name": "GEM/wiki_lingua_en",
|
95 |
+
"prompt_name": "tldr_en",
|
96 |
+
"rougeLsum_recall": 0.26023963410461465,
|
97 |
+
"dataset_path": "GEM/wiki_lingua",
|
98 |
+
"dataset_name": "en",
|
99 |
+
"subset": null,
|
100 |
+
"rougeLsum_recall_stderr": 0.003240567155894993
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"task_name": "GEM/wiki_lingua_en",
|
104 |
+
"prompt_name": "tldr_en",
|
105 |
+
"rougeLsum_fmeasure": 0.17834159419543702,
|
106 |
+
"dataset_path": "GEM/wiki_lingua",
|
107 |
+
"dataset_name": "en",
|
108 |
+
"subset": null,
|
109 |
+
"rougeLsum_fmeasure_stderr": 0.002133544799751138
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"task_name": "GEM/wiki_lingua_en",
|
113 |
+
"prompt_name": "tldr_en",
|
114 |
+
"bleu": 2.986263870634216,
|
115 |
+
"dataset_path": "GEM/wiki_lingua",
|
116 |
+
"dataset_name": "en",
|
117 |
+
"subset": null,
|
118 |
+
"bleu_stderr": 0.0869488541072268
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"config": {
|
122 |
+
"model": "hf-causal",
|
123 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
|
124 |
+
"task_args": "",
|
125 |
+
"num_fewshot": 3,
|
126 |
+
"batch_size": 8,
|
127 |
+
"device": "cuda",
|
128 |
+
"use_cache": false,
|
129 |
+
"limit": 3000,
|
130 |
+
"bootstrap_iters": 10,
|
131 |
+
"seed": 1234
|
132 |
+
}
|
133 |
+
}
|
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": [
|
3 |
+
{
|
4 |
+
"task_name": "GEM/wiki_lingua_en",
|
5 |
+
"prompt_name": "tldr_en",
|
6 |
+
"rouge1_precision": 0.05571162128505982,
|
7 |
+
"dataset_path": "GEM/wiki_lingua",
|
8 |
+
"dataset_name": "en",
|
9 |
+
"subset": null,
|
10 |
+
"rouge1_precision_stderr": 0.0019790843122430768
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"task_name": "GEM/wiki_lingua_en",
|
14 |
+
"prompt_name": "tldr_en",
|
15 |
+
"rouge1_recall": 0.09490155511879607,
|
16 |
+
"dataset_path": "GEM/wiki_lingua",
|
17 |
+
"dataset_name": "en",
|
18 |
+
"subset": null,
|
19 |
+
"rouge1_recall_stderr": 0.003216109108055619
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"task_name": "GEM/wiki_lingua_en",
|
23 |
+
"prompt_name": "tldr_en",
|
24 |
+
"rouge1_fmeasure": 0.06229352605061771,
|
25 |
+
"dataset_path": "GEM/wiki_lingua",
|
26 |
+
"dataset_name": "en",
|
27 |
+
"subset": null,
|
28 |
+
"rouge1_fmeasure_stderr": 0.002059973094043126
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"task_name": "GEM/wiki_lingua_en",
|
32 |
+
"prompt_name": "tldr_en",
|
33 |
+
"rouge2_precision": 0.013746904154847127,
|
34 |
+
"dataset_path": "GEM/wiki_lingua",
|
35 |
+
"dataset_name": "en",
|
36 |
+
"subset": null,
|
37 |
+
"rouge2_precision_stderr": 0.000658207908545434
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"task_name": "GEM/wiki_lingua_en",
|
41 |
+
"prompt_name": "tldr_en",
|
42 |
+
"rouge2_recall": 0.02605352352304377,
|
43 |
+
"dataset_path": "GEM/wiki_lingua",
|
44 |
+
"dataset_name": "en",
|
45 |
+
"subset": null,
|
46 |
+
"rouge2_recall_stderr": 0.0013317962968417952
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"task_name": "GEM/wiki_lingua_en",
|
50 |
+
"prompt_name": "tldr_en",
|
51 |
+
"rouge2_fmeasure": 0.01596233535299914,
|
52 |
+
"dataset_path": "GEM/wiki_lingua",
|
53 |
+
"dataset_name": "en",
|
54 |
+
"subset": null,
|
55 |
+
"rouge2_fmeasure_stderr": 0.0007332643966253175
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"task_name": "GEM/wiki_lingua_en",
|
59 |
+
"prompt_name": "tldr_en",
|
60 |
+
"rougeL_precision": 0.04028867827357096,
|
61 |
+
"dataset_path": "GEM/wiki_lingua",
|
62 |
+
"dataset_name": "en",
|
63 |
+
"subset": null,
|
64 |
+
"rougeL_precision_stderr": 0.0014354449068175981
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"task_name": "GEM/wiki_lingua_en",
|
68 |
+
"prompt_name": "tldr_en",
|
69 |
+
"rougeL_recall": 0.07013160574213247,
|
70 |
+
"dataset_path": "GEM/wiki_lingua",
|
71 |
+
"dataset_name": "en",
|
72 |
+
"subset": null,
|
73 |
+
"rougeL_recall_stderr": 0.0024380518058086703
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"task_name": "GEM/wiki_lingua_en",
|
77 |
+
"prompt_name": "tldr_en",
|
78 |
+
"rougeL_fmeasure": 0.04502172063067562,
|
79 |
+
"dataset_path": "GEM/wiki_lingua",
|
80 |
+
"dataset_name": "en",
|
81 |
+
"subset": null,
|
82 |
+
"rougeL_fmeasure_stderr": 0.0014806563009305842
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"task_name": "GEM/wiki_lingua_en",
|
86 |
+
"prompt_name": "tldr_en",
|
87 |
+
"rougeLsum_precision": 0.0526446584524354,
|
88 |
+
"dataset_path": "GEM/wiki_lingua",
|
89 |
+
"dataset_name": "en",
|
90 |
+
"subset": null,
|
91 |
+
"rougeLsum_precision_stderr": 0.0018735818695187819
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"task_name": "GEM/wiki_lingua_en",
|
95 |
+
"prompt_name": "tldr_en",
|
96 |
+
"rougeLsum_recall": 0.08977749314111799,
|
97 |
+
"dataset_path": "GEM/wiki_lingua",
|
98 |
+
"dataset_name": "en",
|
99 |
+
"subset": null,
|
100 |
+
"rougeLsum_recall_stderr": 0.0030541817569144
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"task_name": "GEM/wiki_lingua_en",
|
104 |
+
"prompt_name": "tldr_en",
|
105 |
+
"rougeLsum_fmeasure": 0.05884434373920439,
|
106 |
+
"dataset_path": "GEM/wiki_lingua",
|
107 |
+
"dataset_name": "en",
|
108 |
+
"subset": null,
|
109 |
+
"rougeLsum_fmeasure_stderr": 0.0019486127272992522
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"task_name": "GEM/wiki_lingua_en",
|
113 |
+
"prompt_name": "tldr_en",
|
114 |
+
"bleu": 0.7653537900871676,
|
115 |
+
"dataset_path": "GEM/wiki_lingua",
|
116 |
+
"dataset_name": "en",
|
117 |
+
"subset": null,
|
118 |
+
"bleu_stderr": 0.049323102026813614
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"config": {
|
122 |
+
"model": "hf-causal",
|
123 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
|
124 |
+
"task_args": "",
|
125 |
+
"num_fewshot": 4,
|
126 |
+
"batch_size": 8,
|
127 |
+
"device": "cuda",
|
128 |
+
"use_cache": false,
|
129 |
+
"limit": 3000,
|
130 |
+
"bootstrap_iters": 10,
|
131 |
+
"seed": 1234
|
132 |
+
}
|
133 |
+
}
|
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": [
|
3 |
+
{
|
4 |
+
"task_name": "GEM/wiki_lingua_en",
|
5 |
+
"prompt_name": "tldr_en",
|
6 |
+
"rouge1_precision": 0.009989872401580602,
|
7 |
+
"dataset_path": "GEM/wiki_lingua",
|
8 |
+
"dataset_name": "en",
|
9 |
+
"subset": null,
|
10 |
+
"rouge1_precision_stderr": 0.0009952828000827124
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"task_name": "GEM/wiki_lingua_en",
|
14 |
+
"prompt_name": "tldr_en",
|
15 |
+
"rouge1_recall": 0.015276156161555617,
|
16 |
+
"dataset_path": "GEM/wiki_lingua",
|
17 |
+
"dataset_name": "en",
|
18 |
+
"subset": null,
|
19 |
+
"rouge1_recall_stderr": 0.001444671711873694
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"task_name": "GEM/wiki_lingua_en",
|
23 |
+
"prompt_name": "tldr_en",
|
24 |
+
"rouge1_fmeasure": 0.010262809104423124,
|
25 |
+
"dataset_path": "GEM/wiki_lingua",
|
26 |
+
"dataset_name": "en",
|
27 |
+
"subset": null,
|
28 |
+
"rouge1_fmeasure_stderr": 0.000941433577191994
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"task_name": "GEM/wiki_lingua_en",
|
32 |
+
"prompt_name": "tldr_en",
|
33 |
+
"rouge2_precision": 0.0024167502513949756,
|
34 |
+
"dataset_path": "GEM/wiki_lingua",
|
35 |
+
"dataset_name": "en",
|
36 |
+
"subset": null,
|
37 |
+
"rouge2_precision_stderr": 0.00033798462070527304
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"task_name": "GEM/wiki_lingua_en",
|
41 |
+
"prompt_name": "tldr_en",
|
42 |
+
"rouge2_recall": 0.004213844763908219,
|
43 |
+
"dataset_path": "GEM/wiki_lingua",
|
44 |
+
"dataset_name": "en",
|
45 |
+
"subset": null,
|
46 |
+
"rouge2_recall_stderr": 0.0006183240014712593
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"task_name": "GEM/wiki_lingua_en",
|
50 |
+
"prompt_name": "tldr_en",
|
51 |
+
"rouge2_fmeasure": 0.0025848086028049965,
|
52 |
+
"dataset_path": "GEM/wiki_lingua",
|
53 |
+
"dataset_name": "en",
|
54 |
+
"subset": null,
|
55 |
+
"rouge2_fmeasure_stderr": 0.0003301678198930521
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"task_name": "GEM/wiki_lingua_en",
|
59 |
+
"prompt_name": "tldr_en",
|
60 |
+
"rougeL_precision": 0.007350732687047713,
|
61 |
+
"dataset_path": "GEM/wiki_lingua",
|
62 |
+
"dataset_name": "en",
|
63 |
+
"subset": null,
|
64 |
+
"rougeL_precision_stderr": 0.000736612195363616
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"task_name": "GEM/wiki_lingua_en",
|
68 |
+
"prompt_name": "tldr_en",
|
69 |
+
"rougeL_recall": 0.011512337859684543,
|
70 |
+
"dataset_path": "GEM/wiki_lingua",
|
71 |
+
"dataset_name": "en",
|
72 |
+
"subset": null,
|
73 |
+
"rougeL_recall_stderr": 0.0011311946838375108
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"task_name": "GEM/wiki_lingua_en",
|
77 |
+
"prompt_name": "tldr_en",
|
78 |
+
"rougeL_fmeasure": 0.007530769725575652,
|
79 |
+
"dataset_path": "GEM/wiki_lingua",
|
80 |
+
"dataset_name": "en",
|
81 |
+
"subset": null,
|
82 |
+
"rougeL_fmeasure_stderr": 0.0006918159097560924
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"task_name": "GEM/wiki_lingua_en",
|
86 |
+
"prompt_name": "tldr_en",
|
87 |
+
"rougeLsum_precision": 0.009401823549618816,
|
88 |
+
"dataset_path": "GEM/wiki_lingua",
|
89 |
+
"dataset_name": "en",
|
90 |
+
"subset": null,
|
91 |
+
"rougeLsum_precision_stderr": 0.0009477935776177291
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"task_name": "GEM/wiki_lingua_en",
|
95 |
+
"prompt_name": "tldr_en",
|
96 |
+
"rougeLsum_recall": 0.014304994804442096,
|
97 |
+
"dataset_path": "GEM/wiki_lingua",
|
98 |
+
"dataset_name": "en",
|
99 |
+
"subset": null,
|
100 |
+
"rougeLsum_recall_stderr": 0.0013529102153531946
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"task_name": "GEM/wiki_lingua_en",
|
104 |
+
"prompt_name": "tldr_en",
|
105 |
+
"rougeLsum_fmeasure": 0.009601062566883777,
|
106 |
+
"dataset_path": "GEM/wiki_lingua",
|
107 |
+
"dataset_name": "en",
|
108 |
+
"subset": null,
|
109 |
+
"rougeLsum_fmeasure_stderr": 0.0008823539930029628
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"task_name": "GEM/wiki_lingua_en",
|
113 |
+
"prompt_name": "tldr_en",
|
114 |
+
"bleu": 2.313122283850011e-06,
|
115 |
+
"dataset_path": "GEM/wiki_lingua",
|
116 |
+
"dataset_name": "en",
|
117 |
+
"subset": null,
|
118 |
+
"bleu_stderr": 4.438365983452618e-06
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"config": {
|
122 |
+
"model": "hf-causal",
|
123 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
|
124 |
+
"task_args": "",
|
125 |
+
"num_fewshot": 5,
|
126 |
+
"batch_size": 8,
|
127 |
+
"device": "cuda",
|
128 |
+
"use_cache": false,
|
129 |
+
"limit": 3000,
|
130 |
+
"bootstrap_iters": 10,
|
131 |
+
"seed": 1234
|
132 |
+
}
|
133 |
+
}
|
8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_3.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": [
|
3 |
+
{
|
4 |
+
"task_name": "gem_xsum",
|
5 |
+
"prompt_name": "article_DOC_summary",
|
6 |
+
"rouge1_precision": 0.14077378985561326,
|
7 |
+
"dataset_path": "GEM/xsum",
|
8 |
+
"dataset_name": null,
|
9 |
+
"subset": "",
|
10 |
+
"rouge1_precision_stderr": 0.002167627071465062
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"task_name": "gem_xsum",
|
14 |
+
"prompt_name": "article_DOC_summary",
|
15 |
+
"rouge1_recall": 0.3365881713882994,
|
16 |
+
"dataset_path": "GEM/xsum",
|
17 |
+
"dataset_name": null,
|
18 |
+
"subset": "",
|
19 |
+
"rouge1_recall_stderr": 0.004904093832075129
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"task_name": "gem_xsum",
|
23 |
+
"prompt_name": "article_DOC_summary",
|
24 |
+
"rouge1_fmeasure": 0.19482488018157684,
|
25 |
+
"dataset_path": "GEM/xsum",
|
26 |
+
"dataset_name": null,
|
27 |
+
"subset": "",
|
28 |
+
"rouge1_fmeasure_stderr": 0.0028449440307477085
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"task_name": "gem_xsum",
|
32 |
+
"prompt_name": "article_DOC_summary",
|
33 |
+
"rouge2_precision": 0.03346907677060051,
|
34 |
+
"dataset_path": "GEM/xsum",
|
35 |
+
"dataset_name": null,
|
36 |
+
"subset": "",
|
37 |
+
"rouge2_precision_stderr": 0.00115993469618386
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"task_name": "gem_xsum",
|
41 |
+
"prompt_name": "article_DOC_summary",
|
42 |
+
"rouge2_recall": 0.08385926778368435,
|
43 |
+
"dataset_path": "GEM/xsum",
|
44 |
+
"dataset_name": null,
|
45 |
+
"subset": "",
|
46 |
+
"rouge2_recall_stderr": 0.0030304591639245464
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"task_name": "gem_xsum",
|
50 |
+
"prompt_name": "article_DOC_summary",
|
51 |
+
"rouge2_fmeasure": 0.04702347182983231,
|
52 |
+
"dataset_path": "GEM/xsum",
|
53 |
+
"dataset_name": null,
|
54 |
+
"subset": "",
|
55 |
+
"rouge2_fmeasure_stderr": 0.001623530160402657
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"task_name": "gem_xsum",
|
59 |
+
"prompt_name": "article_DOC_summary",
|
60 |
+
"rougeL_precision": 0.10579758272401656,
|
61 |
+
"dataset_path": "GEM/xsum",
|
62 |
+
"dataset_name": null,
|
63 |
+
"subset": "",
|
64 |
+
"rougeL_precision_stderr": 0.0016080916023179224
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"task_name": "gem_xsum",
|
68 |
+
"prompt_name": "article_DOC_summary",
|
69 |
+
"rougeL_recall": 0.25509688514423123,
|
70 |
+
"dataset_path": "GEM/xsum",
|
71 |
+
"dataset_name": null,
|
72 |
+
"subset": "",
|
73 |
+
"rougeL_recall_stderr": 0.003826588022233898
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"task_name": "gem_xsum",
|
77 |
+
"prompt_name": "article_DOC_summary",
|
78 |
+
"rougeL_fmeasure": 0.14665273300314152,
|
79 |
+
"dataset_path": "GEM/xsum",
|
80 |
+
"dataset_name": null,
|
81 |
+
"subset": "",
|
82 |
+
"rougeL_fmeasure_stderr": 0.00212580713201847
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"task_name": "gem_xsum",
|
86 |
+
"prompt_name": "article_DOC_summary",
|
87 |
+
"rougeLsum_precision": 0.11263878026618175,
|
88 |
+
"dataset_path": "GEM/xsum",
|
89 |
+
"dataset_name": null,
|
90 |
+
"subset": "",
|
91 |
+
"rougeLsum_precision_stderr": 0.00176128619109869
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"task_name": "gem_xsum",
|
95 |
+
"prompt_name": "article_DOC_summary",
|
96 |
+
"rougeLsum_recall": 0.2715816804884895,
|
97 |
+
"dataset_path": "GEM/xsum",
|
98 |
+
"dataset_name": null,
|
99 |
+
"subset": "",
|
100 |
+
"rougeLsum_recall_stderr": 0.004199504898315826
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"task_name": "gem_xsum",
|
104 |
+
"prompt_name": "article_DOC_summary",
|
105 |
+
"rougeLsum_fmeasure": 0.15621401458929957,
|
106 |
+
"dataset_path": "GEM/xsum",
|
107 |
+
"dataset_name": null,
|
108 |
+
"subset": "",
|
109 |
+
"rougeLsum_fmeasure_stderr": 0.00234420229722605
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"task_name": "gem_xsum",
|
113 |
+
"prompt_name": "article_DOC_summary",
|
114 |
+
"bleu": 2.03791695157546,
|
115 |
+
"dataset_path": "GEM/xsum",
|
116 |
+
"dataset_name": null,
|
117 |
+
"subset": "",
|
118 |
+
"bleu_stderr": 0.1342348737065576
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"config": {
|
122 |
+
"model": "hf-causal",
|
123 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
|
124 |
+
"task_args": "",
|
125 |
+
"num_fewshot": 3,
|
126 |
+
"batch_size": 8,
|
127 |
+
"device": "cuda",
|
128 |
+
"use_cache": false,
|
129 |
+
"limit": 3000,
|
130 |
+
"bootstrap_iters": 10,
|
131 |
+
"seed": 1234
|
132 |
+
}
|
133 |
+
}
|
8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_4.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": [
|
3 |
+
{
|
4 |
+
"task_name": "gem_xsum",
|
5 |
+
"prompt_name": "article_DOC_summary",
|
6 |
+
"rouge1_precision": 0.04332722979741195,
|
7 |
+
"dataset_path": "GEM/xsum",
|
8 |
+
"dataset_name": null,
|
9 |
+
"subset": "",
|
10 |
+
"rouge1_precision_stderr": 0.0025397484135024503
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"task_name": "gem_xsum",
|
14 |
+
"prompt_name": "article_DOC_summary",
|
15 |
+
"rouge1_recall": 0.08638484320409213,
|
16 |
+
"dataset_path": "GEM/xsum",
|
17 |
+
"dataset_name": null,
|
18 |
+
"subset": "",
|
19 |
+
"rouge1_recall_stderr": 0.004855712915740289
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"task_name": "gem_xsum",
|
23 |
+
"prompt_name": "article_DOC_summary",
|
24 |
+
"rouge1_fmeasure": 0.05341124486563334,
|
25 |
+
"dataset_path": "GEM/xsum",
|
26 |
+
"dataset_name": null,
|
27 |
+
"subset": "",
|
28 |
+
"rouge1_fmeasure_stderr": 0.0029118807193791816
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"task_name": "gem_xsum",
|
32 |
+
"prompt_name": "article_DOC_summary",
|
33 |
+
"rouge2_precision": 0.009371982127377696,
|
34 |
+
"dataset_path": "GEM/xsum",
|
35 |
+
"dataset_name": null,
|
36 |
+
"subset": "",
|
37 |
+
"rouge2_precision_stderr": 0.0007761604812612375
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"task_name": "gem_xsum",
|
41 |
+
"prompt_name": "article_DOC_summary",
|
42 |
+
"rouge2_recall": 0.02197790550576286,
|
43 |
+
"dataset_path": "GEM/xsum",
|
44 |
+
"dataset_name": null,
|
45 |
+
"subset": "",
|
46 |
+
"rouge2_recall_stderr": 0.0018737167719494205
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"task_name": "gem_xsum",
|
50 |
+
"prompt_name": "article_DOC_summary",
|
51 |
+
"rouge2_fmeasure": 0.0126600494371876,
|
52 |
+
"dataset_path": "GEM/xsum",
|
53 |
+
"dataset_name": null,
|
54 |
+
"subset": "",
|
55 |
+
"rouge2_fmeasure_stderr": 0.0010393927624132967
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"task_name": "gem_xsum",
|
59 |
+
"prompt_name": "article_DOC_summary",
|
60 |
+
"rougeL_precision": 0.03308111037957417,
|
61 |
+
"dataset_path": "GEM/xsum",
|
62 |
+
"dataset_name": null,
|
63 |
+
"subset": "",
|
64 |
+
"rougeL_precision_stderr": 0.002037483672167313
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"task_name": "gem_xsum",
|
68 |
+
"prompt_name": "article_DOC_summary",
|
69 |
+
"rougeL_recall": 0.06547067873377617,
|
70 |
+
"dataset_path": "GEM/xsum",
|
71 |
+
"dataset_name": null,
|
72 |
+
"subset": "",
|
73 |
+
"rougeL_recall_stderr": 0.0037402890017668913
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"task_name": "gem_xsum",
|
77 |
+
"prompt_name": "article_DOC_summary",
|
78 |
+
"rougeL_fmeasure": 0.04031062592059575,
|
79 |
+
"dataset_path": "GEM/xsum",
|
80 |
+
"dataset_name": null,
|
81 |
+
"subset": "",
|
82 |
+
"rougeL_fmeasure_stderr": 0.0022065055222514843
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"task_name": "gem_xsum",
|
86 |
+
"prompt_name": "article_DOC_summary",
|
87 |
+
"rougeLsum_precision": 0.03547102946933422,
|
88 |
+
"dataset_path": "GEM/xsum",
|
89 |
+
"dataset_name": null,
|
90 |
+
"subset": "",
|
91 |
+
"rougeLsum_precision_stderr": 0.002145817702406172
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"task_name": "gem_xsum",
|
95 |
+
"prompt_name": "article_DOC_summary",
|
96 |
+
"rougeLsum_recall": 0.07066149952031933,
|
97 |
+
"dataset_path": "GEM/xsum",
|
98 |
+
"dataset_name": null,
|
99 |
+
"subset": "",
|
100 |
+
"rougeLsum_recall_stderr": 0.004048133700252855
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"task_name": "gem_xsum",
|
104 |
+
"prompt_name": "article_DOC_summary",
|
105 |
+
"rougeLsum_fmeasure": 0.04344922023460101,
|
106 |
+
"dataset_path": "GEM/xsum",
|
107 |
+
"dataset_name": null,
|
108 |
+
"subset": "",
|
109 |
+
"rougeLsum_fmeasure_stderr": 0.002384615926685988
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"task_name": "gem_xsum",
|
113 |
+
"prompt_name": "article_DOC_summary",
|
114 |
+
"bleu": 0.9907623400783704,
|
115 |
+
"dataset_path": "GEM/xsum",
|
116 |
+
"dataset_name": null,
|
117 |
+
"subset": "",
|
118 |
+
"bleu_stderr": 0.12144441541342028
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"config": {
|
122 |
+
"model": "hf-causal",
|
123 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
|
124 |
+
"task_args": "",
|
125 |
+
"num_fewshot": 4,
|
126 |
+
"batch_size": 8,
|
127 |
+
"device": "cuda",
|
128 |
+
"use_cache": false,
|
129 |
+
"limit": 3000,
|
130 |
+
"bootstrap_iters": 10,
|
131 |
+
"seed": 1234
|
132 |
+
}
|
133 |
+
}
|
8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_5.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": [
|
3 |
+
{
|
4 |
+
"task_name": "gem_xsum",
|
5 |
+
"prompt_name": "article_DOC_summary",
|
6 |
+
"rouge1_precision": 0.002432491256372859,
|
7 |
+
"dataset_path": "GEM/xsum",
|
8 |
+
"dataset_name": null,
|
9 |
+
"subset": "",
|
10 |
+
"rouge1_precision_stderr": 0.0006531802890059347
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"task_name": "gem_xsum",
|
14 |
+
"prompt_name": "article_DOC_summary",
|
15 |
+
"rouge1_recall": 0.0035688165030904754,
|
16 |
+
"dataset_path": "GEM/xsum",
|
17 |
+
"dataset_name": null,
|
18 |
+
"subset": "",
|
19 |
+
"rouge1_recall_stderr": 0.000969634171191206
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"task_name": "gem_xsum",
|
23 |
+
"prompt_name": "article_DOC_summary",
|
24 |
+
"rouge1_fmeasure": 0.0026267053439145405,
|
25 |
+
"dataset_path": "GEM/xsum",
|
26 |
+
"dataset_name": null,
|
27 |
+
"subset": "",
|
28 |
+
"rouge1_fmeasure_stderr": 0.0006851925438686376
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"task_name": "gem_xsum",
|
32 |
+
"prompt_name": "article_DOC_summary",
|
33 |
+
"rouge2_precision": 0.0002462802129838755,
|
34 |
+
"dataset_path": "GEM/xsum",
|
35 |
+
"dataset_name": null,
|
36 |
+
"subset": "",
|
37 |
+
"rouge2_precision_stderr": 0.00010248150275091556
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"task_name": "gem_xsum",
|
41 |
+
"prompt_name": "article_DOC_summary",
|
42 |
+
"rouge2_recall": 0.00045873937383371344,
|
43 |
+
"dataset_path": "GEM/xsum",
|
44 |
+
"dataset_name": null,
|
45 |
+
"subset": "",
|
46 |
+
"rouge2_recall_stderr": 0.0002027676865696476
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"task_name": "gem_xsum",
|
50 |
+
"prompt_name": "article_DOC_summary",
|
51 |
+
"rouge2_fmeasure": 0.0003000575858304317,
|
52 |
+
"dataset_path": "GEM/xsum",
|
53 |
+
"dataset_name": null,
|
54 |
+
"subset": "",
|
55 |
+
"rouge2_fmeasure_stderr": 0.00012455541465929192
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"task_name": "gem_xsum",
|
59 |
+
"prompt_name": "article_DOC_summary",
|
60 |
+
"rougeL_precision": 0.001824459823195743,
|
61 |
+
"dataset_path": "GEM/xsum",
|
62 |
+
"dataset_name": null,
|
63 |
+
"subset": "",
|
64 |
+
"rougeL_precision_stderr": 0.000493813415204384
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"task_name": "gem_xsum",
|
68 |
+
"prompt_name": "article_DOC_summary",
|
69 |
+
"rougeL_recall": 0.002706254470313614,
|
70 |
+
"dataset_path": "GEM/xsum",
|
71 |
+
"dataset_name": null,
|
72 |
+
"subset": "",
|
73 |
+
"rougeL_recall_stderr": 0.0007319861753860738
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"task_name": "gem_xsum",
|
77 |
+
"prompt_name": "article_DOC_summary",
|
78 |
+
"rougeL_fmeasure": 0.001963800127619678,
|
79 |
+
"dataset_path": "GEM/xsum",
|
80 |
+
"dataset_name": null,
|
81 |
+
"subset": "",
|
82 |
+
"rougeL_fmeasure_stderr": 0.0005077215919641312
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"task_name": "gem_xsum",
|
86 |
+
"prompt_name": "article_DOC_summary",
|
87 |
+
"rougeLsum_precision": 0.001947339725356528,
|
88 |
+
"dataset_path": "GEM/xsum",
|
89 |
+
"dataset_name": null,
|
90 |
+
"subset": "",
|
91 |
+
"rougeLsum_precision_stderr": 0.0005292516418451348
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"task_name": "gem_xsum",
|
95 |
+
"prompt_name": "article_DOC_summary",
|
96 |
+
"rougeLsum_recall": 0.0028418243363945105,
|
97 |
+
"dataset_path": "GEM/xsum",
|
98 |
+
"dataset_name": null,
|
99 |
+
"subset": "",
|
100 |
+
"rougeLsum_recall_stderr": 0.0007704086076680828
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"task_name": "gem_xsum",
|
104 |
+
"prompt_name": "article_DOC_summary",
|
105 |
+
"rougeLsum_fmeasure": 0.002094899399917077,
|
106 |
+
"dataset_path": "GEM/xsum",
|
107 |
+
"dataset_name": null,
|
108 |
+
"subset": "",
|
109 |
+
"rougeLsum_fmeasure_stderr": 0.0005500941719652507
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"task_name": "gem_xsum",
|
113 |
+
"prompt_name": "article_DOC_summary",
|
114 |
+
"bleu": 1.959625061740418e-19,
|
115 |
+
"dataset_path": "GEM/xsum",
|
116 |
+
"dataset_name": null,
|
117 |
+
"subset": "",
|
118 |
+
"bleu_stderr": 1.1984703371139495e-15
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"config": {
|
122 |
+
"model": "hf-causal",
|
123 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
|
124 |
+
"task_args": "",
|
125 |
+
"num_fewshot": 5,
|
126 |
+
"batch_size": 8,
|
127 |
+
"device": "cuda",
|
128 |
+
"use_cache": false,
|
129 |
+
"limit": 3000,
|
130 |
+
"bootstrap_iters": 10,
|
131 |
+
"seed": 1234
|
132 |
+
}
|
133 |
+
}
|
8b7178b13b/evaluation/rankeval/8b7178b13b_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.35,0.01509065034144423,0
|
3 |
+
anli_r2,acc,0.346,0.015050266127564448,0
|
4 |
+
anli_r3,acc,0.355,0.013819249004047296,0
|
5 |
+
arc_challenge,acc,0.29180887372013653,0.013284525292403496,0
|
6 |
+
arc_challenge,acc_norm,0.3216723549488055,0.013650488084494162,0
|
7 |
+
arc_easy,acc,0.6220538720538721,0.009949405744045469,0
|
8 |
+
arc_easy,acc_norm,0.5395622895622896,0.010227616386289017,0
|
9 |
+
boolq,acc,0.6376146788990825,0.008407308655864048,1
|
10 |
+
cb,acc,0.26785714285714285,0.05971290310957636,1
|
11 |
+
cb,f1,0.2374338624338624,,1
|
12 |
+
copa,acc,0.81,0.03942772444036623,0
|
13 |
+
hellaswag,acc,0.5215096594303924,0.004985162074336112,0
|
14 |
+
hellaswag,acc_norm,0.6843258315076678,0.004638339207348913,0
|
15 |
+
piqa,acc,0.7627856365614799,0.009924694933586364,0
|
16 |
+
piqa,acc_norm,0.7747551686615887,0.00974664347103214,0
|
17 |
+
rte,acc,0.5270758122743683,0.030052303463143706,0
|
18 |
+
sciq,acc,0.87,0.010640169792499361,0
|
19 |
+
sciq,acc_norm,0.807,0.012486268734370145,0
|
20 |
+
storycloze_2016,acc,0.7455905932656334,0.010071542492663043,0
|
21 |
+
winogrande,acc,0.5659037095501184,0.013929882555694054,0
|
8b7178b13b/evaluation/rankeval/8b7178b13b_0_lm-eval_global_step84877_2023-05-15-10-07-19_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.35,
|
5 |
-
"acc_stderr": 0.01509065034144423
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.346,
|
9 |
-
"acc_stderr": 0.015050266127564448
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.355,
|
13 |
-
"acc_stderr": 0.013819249004047296
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.26785714285714285,
|
17 |
-
"acc_stderr": 0.05971290310957636,
|
18 |
-
"f1": 0.2374338624338624
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.81,
|
22 |
-
"acc_stderr": 0.03942772444036623
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5215096594303924,
|
26 |
-
"acc_stderr": 0.004985162074336112,
|
27 |
-
"acc_norm": 0.6843258315076678,
|
28 |
-
"acc_norm_stderr": 0.004638339207348913
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5270758122743683,
|
32 |
-
"acc_stderr": 0.030052303463143706
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5659037095501184,
|
36 |
-
"acc_stderr": 0.013929882555694054
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7455905932656334,
|
40 |
-
"acc_stderr": 0.010071542492663043
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6376146788990825,
|
44 |
-
"acc_stderr": 0.008407308655864048
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6220538720538721,
|
48 |
-
"acc_stderr": 0.009949405744045469,
|
49 |
-
"acc_norm": 0.5395622895622896,
|
50 |
-
"acc_norm_stderr": 0.010227616386289017
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.29180887372013653,
|
54 |
-
"acc_stderr": 0.013284525292403496,
|
55 |
-
"acc_norm": 0.3216723549488055,
|
56 |
-
"acc_norm_stderr": 0.013650488084494162
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.87,
|
60 |
-
"acc_stderr": 0.010640169792499361,
|
61 |
-
"acc_norm": 0.807,
|
62 |
-
"acc_norm_stderr": 0.012486268734370145
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7627856365614799,
|
66 |
-
"acc_stderr": 0.009924694933586364,
|
67 |
-
"acc_norm": 0.7747551686615887,
|
68 |
-
"acc_norm_stderr": 0.00974664347103214
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b13b/evaluation/rankeval/8b7178b13b_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.328,0.014853842487270334,0
|
3 |
+
anli_r2,acc,0.316,0.01470919305605713,0
|
4 |
+
anli_r3,acc,0.3591666666666667,0.013855141559780364,0
|
5 |
+
arc_challenge,acc,0.3054607508532423,0.0134600804780025,0
|
6 |
+
arc_challenge,acc_norm,0.3319112627986348,0.013760988200880538,0
|
7 |
+
arc_easy,acc,0.6422558922558923,0.009835772757343361,0
|
8 |
+
arc_easy,acc_norm,0.6035353535353535,0.010037412763064529,0
|
9 |
+
boolq,acc,0.6477064220183486,0.00835476049390613,1
|
10 |
+
cb,acc,0.26785714285714285,0.05971290310957635,1
|
11 |
+
cb,f1,0.21294539321104786,,1
|
12 |
+
copa,acc,0.79,0.040936018074033256,0
|
13 |
+
hellaswag,acc,0.5201155148376817,0.004985741706385719,0
|
14 |
+
hellaswag,acc_norm,0.6825333598884684,0.004645393477680675,0
|
15 |
+
piqa,acc,0.764961915125136,0.009893146688805326,0
|
16 |
+
piqa,acc_norm,0.7725788900979326,0.009779850767847232,0
|
17 |
+
rte,acc,0.4657039711191336,0.030025579819366426,0
|
18 |
+
sciq,acc,0.905,0.009276910103103329,0
|
19 |
+
sciq,acc_norm,0.88,0.0102813280127474,0
|
20 |
+
storycloze_2016,acc,0.7365045430251203,0.010187168219156485,0
|
21 |
+
winogrande,acc,0.5872138910812944,0.013837060648682103,0
|
8b7178b13b/evaluation/rankeval/8b7178b13b_1_lm-eval_global_step84877_2023-05-15-10-07-19_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.328,
|
5 |
-
"acc_stderr": 0.014853842487270334
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.316,
|
9 |
-
"acc_stderr": 0.01470919305605713
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3591666666666667,
|
13 |
-
"acc_stderr": 0.013855141559780364
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.26785714285714285,
|
17 |
-
"acc_stderr": 0.05971290310957635,
|
18 |
-
"f1": 0.21294539321104786
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.79,
|
22 |
-
"acc_stderr": 0.040936018074033256
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5201155148376817,
|
26 |
-
"acc_stderr": 0.004985741706385719,
|
27 |
-
"acc_norm": 0.6825333598884684,
|
28 |
-
"acc_norm_stderr": 0.004645393477680675
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.4657039711191336,
|
32 |
-
"acc_stderr": 0.030025579819366426
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5872138910812944,
|
36 |
-
"acc_stderr": 0.013837060648682103
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7365045430251203,
|
40 |
-
"acc_stderr": 0.010187168219156485
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6477064220183486,
|
44 |
-
"acc_stderr": 0.00835476049390613
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6422558922558923,
|
48 |
-
"acc_stderr": 0.009835772757343361,
|
49 |
-
"acc_norm": 0.6035353535353535,
|
50 |
-
"acc_norm_stderr": 0.010037412763064529
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3054607508532423,
|
54 |
-
"acc_stderr": 0.0134600804780025,
|
55 |
-
"acc_norm": 0.3319112627986348,
|
56 |
-
"acc_norm_stderr": 0.013760988200880538
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.905,
|
60 |
-
"acc_stderr": 0.009276910103103329,
|
61 |
-
"acc_norm": 0.88,
|
62 |
-
"acc_norm_stderr": 0.0102813280127474
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.764961915125136,
|
66 |
-
"acc_stderr": 0.009893146688805326,
|
67 |
-
"acc_norm": 0.7725788900979326,
|
68 |
-
"acc_norm_stderr": 0.009779850767847232
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b13b/evaluation/rankeval/8b7178b13b_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.336,0.014944140233795018,0
|
3 |
+
anli_r2,acc,0.329,0.01486539538592835,0
|
4 |
+
anli_r3,acc,0.35083333333333333,0.013782212417178195,0
|
5 |
+
arc_challenge,acc,0.310580204778157,0.013522292098053059,0
|
6 |
+
arc_challenge,acc_norm,0.33532423208191126,0.013796182947785562,0
|
7 |
+
arc_easy,acc,0.6481481481481481,0.009799078929868706,0
|
8 |
+
arc_easy,acc_norm,0.6212121212121212,0.00995373765654204,0
|
9 |
+
boolq,acc,0.6382262996941896,0.008404238796949254,1
|
10 |
+
cb,acc,0.19642857142857142,0.05357142857142858,1
|
11 |
+
cb,f1,0.1668300653594771,,1
|
12 |
+
copa,acc,0.78,0.04163331998932263,0
|
13 |
+
hellaswag,acc,0.5200159330810595,0.004985781620467012,0
|
14 |
+
hellaswag,acc_norm,0.6863174666401115,0.004630407476835209,0
|
15 |
+
piqa,acc,0.7573449401523396,0.0100020025697087,0
|
16 |
+
piqa,acc_norm,0.764961915125136,0.009893146688805319,0
|
17 |
+
rte,acc,0.48014440433212996,0.0300727231673172,0
|
18 |
+
sciq,acc,0.913,0.00891686663074591,0
|
19 |
+
sciq,acc_norm,0.889,0.009938701010583726,0
|
20 |
+
storycloze_2016,acc,0.743452699091395,0.01009926092771917,0
|
21 |
+
winogrande,acc,0.5864246250986582,0.013840971763195306,0
|
8b7178b13b/evaluation/rankeval/8b7178b13b_2_lm-eval_global_step84877_2023-05-15-10-07-19_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.336,
|
5 |
-
"acc_stderr": 0.014944140233795018
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.329,
|
9 |
-
"acc_stderr": 0.01486539538592835
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.35083333333333333,
|
13 |
-
"acc_stderr": 0.013782212417178195
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.19642857142857142,
|
17 |
-
"acc_stderr": 0.05357142857142858,
|
18 |
-
"f1": 0.1668300653594771
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.78,
|
22 |
-
"acc_stderr": 0.04163331998932263
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5200159330810595,
|
26 |
-
"acc_stderr": 0.004985781620467012,
|
27 |
-
"acc_norm": 0.6863174666401115,
|
28 |
-
"acc_norm_stderr": 0.004630407476835209
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.48014440433212996,
|
32 |
-
"acc_stderr": 0.0300727231673172
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5864246250986582,
|
36 |
-
"acc_stderr": 0.013840971763195306
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.743452699091395,
|
40 |
-
"acc_stderr": 0.01009926092771917
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6382262996941896,
|
44 |
-
"acc_stderr": 0.008404238796949254
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6481481481481481,
|
48 |
-
"acc_stderr": 0.009799078929868706,
|
49 |
-
"acc_norm": 0.6212121212121212,
|
50 |
-
"acc_norm_stderr": 0.00995373765654204
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.310580204778157,
|
54 |
-
"acc_stderr": 0.013522292098053059,
|
55 |
-
"acc_norm": 0.33532423208191126,
|
56 |
-
"acc_norm_stderr": 0.013796182947785562
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.913,
|
60 |
-
"acc_stderr": 0.00891686663074591,
|
61 |
-
"acc_norm": 0.889,
|
62 |
-
"acc_norm_stderr": 0.009938701010583726
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7573449401523396,
|
66 |
-
"acc_stderr": 0.0100020025697087,
|
67 |
-
"acc_norm": 0.764961915125136,
|
68 |
-
"acc_norm_stderr": 0.009893146688805319
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b13b/evaluation/rankeval/8b7178b13b_3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.348,0.01507060460376841,0
|
3 |
+
anli_r2,acc,0.344,0.015029633724408947,0
|
4 |
+
anli_r3,acc,0.3325,0.013605417345710528,0
|
5 |
+
arc_challenge,acc,0.3148464163822526,0.01357265770308495,0
|
6 |
+
arc_challenge,acc_norm,0.3250853242320819,0.013688147309729119,0
|
7 |
+
arc_easy,acc,0.648989898989899,0.009793703885101042,0
|
8 |
+
arc_easy,acc_norm,0.6393097643097643,0.009853512108416748,0
|
9 |
+
boolq,acc,0.6363914373088685,0.008413404209789989,1
|
10 |
+
cb,acc,0.3392857142857143,0.06384226561930825,1
|
11 |
+
cb,f1,0.3185837135128588,,1
|
12 |
+
copa,acc,0.78,0.04163331998932263,0
|
13 |
+
hellaswag,acc,0.5206134236207927,0.00498553915978342,0
|
14 |
+
hellaswag,acc_norm,0.6902011551483768,0.00461465517501001,0
|
15 |
+
piqa,acc,0.7584330794341676,0.009986718001804467,0
|
16 |
+
piqa,acc_norm,0.7665941240478781,0.009869247889520991,0
|
17 |
+
rte,acc,0.48014440433212996,0.0300727231673172,0
|
18 |
+
sciq,acc,0.917,0.008728527206074792,0
|
19 |
+
sciq,acc_norm,0.911,0.009008893392651521,0
|
20 |
+
storycloze_2016,acc,0.7504008551576697,0.010008002459430848,0
|
21 |
+
winogrande,acc,0.601420678768745,0.01376035717687383,0
|
8b7178b13b/evaluation/rankeval/8b7178b13b_3_lm-eval_global_step84877_2023-05-15-10-07-25_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.348,
|
5 |
-
"acc_stderr": 0.01507060460376841
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.344,
|
9 |
-
"acc_stderr": 0.015029633724408947
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3325,
|
13 |
-
"acc_stderr": 0.013605417345710528
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.3392857142857143,
|
17 |
-
"acc_stderr": 0.06384226561930825,
|
18 |
-
"f1": 0.3185837135128588
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.78,
|
22 |
-
"acc_stderr": 0.04163331998932263
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5206134236207927,
|
26 |
-
"acc_stderr": 0.00498553915978342,
|
27 |
-
"acc_norm": 0.6902011551483768,
|
28 |
-
"acc_norm_stderr": 0.00461465517501001
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.48014440433212996,
|
32 |
-
"acc_stderr": 0.0300727231673172
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.601420678768745,
|
36 |
-
"acc_stderr": 0.01376035717687383
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7504008551576697,
|
40 |
-
"acc_stderr": 0.010008002459430848
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6363914373088685,
|
44 |
-
"acc_stderr": 0.008413404209789989
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.648989898989899,
|
48 |
-
"acc_stderr": 0.009793703885101042,
|
49 |
-
"acc_norm": 0.6393097643097643,
|
50 |
-
"acc_norm_stderr": 0.009853512108416748
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3148464163822526,
|
54 |
-
"acc_stderr": 0.01357265770308495,
|
55 |
-
"acc_norm": 0.3250853242320819,
|
56 |
-
"acc_norm_stderr": 0.013688147309729119
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.917,
|
60 |
-
"acc_stderr": 0.008728527206074792,
|
61 |
-
"acc_norm": 0.911,
|
62 |
-
"acc_norm_stderr": 0.009008893392651521
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7584330794341676,
|
66 |
-
"acc_stderr": 0.009986718001804467,
|
67 |
-
"acc_norm": 0.7665941240478781,
|
68 |
-
"acc_norm_stderr": 0.009869247889520991
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b13b/evaluation/rankeval/8b7178b13b_4.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.344,0.015029633724408947,0
|
3 |
+
anli_r2,acc,0.345,0.015039986742055238,0
|
4 |
+
anli_r3,acc,0.34833333333333333,0.013759437498874072,0
|
5 |
+
arc_challenge,acc,0.3199658703071672,0.013631345807016196,0
|
6 |
+
arc_challenge,acc_norm,0.3447098976109215,0.01388881628678211,0
|
7 |
+
arc_easy,acc,0.6632996632996633,0.009697166595752475,0
|
8 |
+
arc_easy,acc_norm,0.6447811447811448,0.009820245899287124,0
|
9 |
+
boolq,acc,0.636085626911315,0.00841491890912884,1
|
10 |
+
cb,acc,0.21428571428571427,0.055328333517248834,1
|
11 |
+
cb,f1,0.1997113997113997,,1
|
12 |
+
copa,acc,0.79,0.040936018074033256,0
|
13 |
+
hellaswag,acc,0.5206134236207927,0.004985539159783419,0
|
14 |
+
hellaswag,acc_norm,0.6900019916351324,0.004615472210316043,0
|
15 |
+
piqa,acc,0.7665941240478781,0.009869247889521007,0
|
16 |
+
piqa,acc_norm,0.7682263329706203,0.00984514377279404,0
|
17 |
+
rte,acc,0.48736462093862815,0.030086851767188564,0
|
18 |
+
sciq,acc,0.927,0.008230354715244062,0
|
19 |
+
sciq,acc_norm,0.908,0.009144376393151108,0
|
20 |
+
storycloze_2016,acc,0.7509353287012293,0.010000841162740146,0
|
21 |
+
winogrande,acc,0.6077348066298343,0.013722400462000888,0
|
8b7178b13b/evaluation/rankeval/8b7178b13b_4_lm-eval_global_step84877_2023-05-15-10-06-46_4shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.344,
|
5 |
-
"acc_stderr": 0.015029633724408947
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.345,
|
9 |
-
"acc_stderr": 0.015039986742055238
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.34833333333333333,
|
13 |
-
"acc_stderr": 0.013759437498874072
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.21428571428571427,
|
17 |
-
"acc_stderr": 0.055328333517248834,
|
18 |
-
"f1": 0.1997113997113997
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.79,
|
22 |
-
"acc_stderr": 0.040936018074033256
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5206134236207927,
|
26 |
-
"acc_stderr": 0.004985539159783419,
|
27 |
-
"acc_norm": 0.6900019916351324,
|
28 |
-
"acc_norm_stderr": 0.004615472210316043
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.48736462093862815,
|
32 |
-
"acc_stderr": 0.030086851767188564
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6077348066298343,
|
36 |
-
"acc_stderr": 0.013722400462000888
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7509353287012293,
|
40 |
-
"acc_stderr": 0.010000841162740146
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.636085626911315,
|
44 |
-
"acc_stderr": 0.00841491890912884
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6632996632996633,
|
48 |
-
"acc_stderr": 0.009697166595752475,
|
49 |
-
"acc_norm": 0.6447811447811448,
|
50 |
-
"acc_norm_stderr": 0.009820245899287124
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3199658703071672,
|
54 |
-
"acc_stderr": 0.013631345807016196,
|
55 |
-
"acc_norm": 0.3447098976109215,
|
56 |
-
"acc_norm_stderr": 0.01388881628678211
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.927,
|
60 |
-
"acc_stderr": 0.008230354715244062,
|
61 |
-
"acc_norm": 0.908,
|
62 |
-
"acc_norm_stderr": 0.009144376393151108
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7665941240478781,
|
66 |
-
"acc_stderr": 0.009869247889521007,
|
67 |
-
"acc_norm": 0.7682263329706203,
|
68 |
-
"acc_norm_stderr": 0.00984514377279404
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b13b/evaluation/rankeval/8b7178b13b_5.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.34,0.014987482264363935,0
|
3 |
+
anli_r2,acc,0.337,0.014955087918653605,0
|
4 |
+
anli_r3,acc,0.3325,0.013605417345710526,0
|
5 |
+
arc_challenge,acc,0.3310580204778157,0.013752062419817834,0
|
6 |
+
arc_challenge,acc_norm,0.3447098976109215,0.013888816286782112,0
|
7 |
+
arc_easy,acc,0.6536195286195287,0.009763542075695738,0
|
8 |
+
arc_easy,acc_norm,0.6401515151515151,0.009848484848484836,0
|
9 |
+
boolq,acc,0.6409785932721712,0.008390241754319908,1
|
10 |
+
cb,acc,0.23214285714285715,0.0569293902400011,1
|
11 |
+
cb,f1,0.223351041141572,,1
|
12 |
+
copa,acc,0.76,0.042923469599092816,0
|
13 |
+
hellaswag,acc,0.5209121688906593,0.004985415250690905,0
|
14 |
+
hellaswag,acc_norm,0.689205337582155,0.004618730353217064,0
|
15 |
+
piqa,acc,0.7606093579978237,0.009955884250291688,0
|
16 |
+
piqa,acc_norm,0.7747551686615887,0.009746643471032136,0
|
17 |
+
rte,acc,0.5054151624548736,0.030094698123239966,0
|
18 |
+
sciq,acc,0.923,0.008434580140240648,0
|
19 |
+
sciq,acc_norm,0.912,0.008963053962592074,0
|
20 |
+
storycloze_2016,acc,0.7498663816141101,0.010015143382536456,0
|
21 |
+
winogrande,acc,0.5935280189423836,0.01380444869775337,0
|
8b7178b13b/evaluation/rankeval/8b7178b13b_5_lm-eval_global_step84877_2023-05-15-10-06-46_5shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.34,
|
5 |
-
"acc_stderr": 0.014987482264363935
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.337,
|
9 |
-
"acc_stderr": 0.014955087918653605
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3325,
|
13 |
-
"acc_stderr": 0.013605417345710526
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.23214285714285715,
|
17 |
-
"acc_stderr": 0.0569293902400011,
|
18 |
-
"f1": 0.223351041141572
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.76,
|
22 |
-
"acc_stderr": 0.042923469599092816
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5209121688906593,
|
26 |
-
"acc_stderr": 0.004985415250690905,
|
27 |
-
"acc_norm": 0.689205337582155,
|
28 |
-
"acc_norm_stderr": 0.004618730353217064
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5054151624548736,
|
32 |
-
"acc_stderr": 0.030094698123239966
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5935280189423836,
|
36 |
-
"acc_stderr": 0.01380444869775337
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7498663816141101,
|
40 |
-
"acc_stderr": 0.010015143382536456
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6409785932721712,
|
44 |
-
"acc_stderr": 0.008390241754319908
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6536195286195287,
|
48 |
-
"acc_stderr": 0.009763542075695738,
|
49 |
-
"acc_norm": 0.6401515151515151,
|
50 |
-
"acc_norm_stderr": 0.009848484848484836
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3310580204778157,
|
54 |
-
"acc_stderr": 0.013752062419817834,
|
55 |
-
"acc_norm": 0.3447098976109215,
|
56 |
-
"acc_norm_stderr": 0.013888816286782112
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.923,
|
60 |
-
"acc_stderr": 0.008434580140240648,
|
61 |
-
"acc_norm": 0.912,
|
62 |
-
"acc_norm_stderr": 0.008963053962592074
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7606093579978237,
|
66 |
-
"acc_stderr": 0.009955884250291688,
|
67 |
-
"acc_norm": 0.7747551686615887,
|
68 |
-
"acc_norm_stderr": 0.009746643471032136
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b13b/transformers/merges.txt
ADDED
File without changes
|
8b7178b13b/transformers/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
8b7178b13b/transformers/vocab.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783
|
3 |
+
size 1042301
|
8b7178b178b/evaluation/8b7178b178b_0_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.0,
|
5 |
+
"em_stderr": 0.0
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b178b/transformers",
|
14 |
+
"num_fewshot": 0,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b178b/evaluation/8b7178b178b_1_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.0,
|
5 |
+
"em_stderr": 0.0
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b178b/transformers",
|
14 |
+
"num_fewshot": 1,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b178b/evaluation/8b7178b178b_2_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.0,
|
5 |
+
"em_stderr": 0.0
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b178b/transformers",
|
14 |
+
"num_fewshot": 2,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b178b/evaluation/8b7178b178b_3_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.0,
|
5 |
+
"em_stderr": 0.0
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b178b/transformers",
|
14 |
+
"num_fewshot": 3,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b178b/evaluation/8b7178b178b_4_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.0,
|
5 |
+
"em_stderr": 0.0
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b178b/transformers",
|
14 |
+
"num_fewshot": 4,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b178b/evaluation/8b7178b178b_5_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.0,
|
5 |
+
"em_stderr": 0.0
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b178b/transformers",
|
14 |
+
"num_fewshot": 5,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|