Muennighoff
commited on
Commit
•
71f6ad1
1
Parent(s):
2e5ca64
Ad@
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- 8b7178b13b/evaluation/8b7178b13b_1_babi.json +22 -0
- 8b7178b13b/evaluation/8b7178b13b_2_babi.json +22 -0
- 8b7178b13b/evaluation/8b7178b13b_3_babi.json +22 -0
- 8b7178b13b/evaluation/8b7178b13b_4_babi.json +22 -0
- 8b7178b13b/evaluation/8b7178b13b_5_babi.json +22 -0
- 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json +1 -0
- 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json +1 -0
- 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json +1 -0
- 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json +1 -0
- 8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_3.json +1 -0
- 8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_4.json +1 -0
- 8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_5.json +1 -0
- 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl +3 -0
- 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl +3 -0
- 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl +3 -0
- 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl +3 -0
- 8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl +3 -0
- 8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl +3 -0
- 8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl +3 -0
- 8b7178b13b/evaluation/generation/merged.csv +53 -0
- 8b7178b13b/evaluation/generation/merged.json +1 -0
- 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json +133 -0
- 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json +133 -0
- 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json +133 -0
- 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json +133 -0
- 8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_3.json +133 -0
- 8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_4.json +133 -0
- 8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_5.json +133 -0
- 8b7178b13b/evaluation/rankeval/8b7178b13b_0.csv +21 -0
- 8b7178b13b/evaluation/rankeval/8b7178b13b_0_lm-eval_global_step84877_2023-05-15-10-06-37_0shots_backup.json +0 -87
- 8b7178b13b/evaluation/rankeval/8b7178b13b_1.csv +21 -0
- 8b7178b13b/evaluation/rankeval/8b7178b13b_1_lm-eval_global_step84877_2023-05-15-10-06-37_1shots_backup.json +0 -87
- 8b7178b13b/evaluation/rankeval/8b7178b13b_2.csv +21 -0
- 8b7178b13b/evaluation/rankeval/8b7178b13b_2_lm-eval_global_step84877_2023-05-15-10-06-37_2shots_backup.json +0 -87
- 8b7178b13b/evaluation/rankeval/8b7178b13b_3.csv +21 -0
- 8b7178b13b/evaluation/rankeval/8b7178b13b_3_lm-eval_global_step84877_2023-05-15-10-06-37_3shots_backup.json +0 -87
- 8b7178b13b/evaluation/rankeval/8b7178b13b_4.csv +21 -0
- 8b7178b13b/evaluation/rankeval/8b7178b13b_4_lm-eval_global_step84877_2023-05-15-10-07-32_4shots_backup.json +0 -87
- 8b7178b13b/evaluation/rankeval/8b7178b13b_5.csv +21 -0
- 8b7178b13b/evaluation/rankeval/8b7178b13b_5_lm-eval_global_step84877_2023-05-15-10-06-37_5shots_backup.json +0 -87
- 8b7178b178b/evaluation/8b7178b178b_1_babi.json +22 -0
- 8b7178b178b/evaluation/8b7178b178b_2_babi.json +22 -0
- 8b7178b178b/evaluation/8b7178b178b_3_babi.json +22 -0
- 8b7178b178b/evaluation/8b7178b178b_4_babi.json +22 -0
- 8b7178b178b/evaluation/8b7178b178b_5_babi.json +22 -0
- 8b7178b25b/evaluation/8b7178b25b_0_babi.json +22 -0
- 8b7178b25b/evaluation/8b7178b25b_1_babi.json +22 -0
- 8b7178b25b/evaluation/8b7178b25b_2_babi.json +22 -0
- 8b7178b25b/evaluation/8b7178b25b_3_babi.json +22 -0
- 8b7178b25b/evaluation/8b7178b25b_4_babi.json +22 -0
8b7178b13b/evaluation/8b7178b13b_1_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.10966666666666666,
|
5 |
+
"em_stderr": 0.005705916414010263
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers",
|
14 |
+
"num_fewshot": 1,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b13b/evaluation/8b7178b13b_2_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.20533333333333334,
|
5 |
+
"em_stderr": 0.007376222253753254
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers",
|
14 |
+
"num_fewshot": 2,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b13b/evaluation/8b7178b13b_3_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.258,
|
5 |
+
"em_stderr": 0.007989573064892506
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers",
|
14 |
+
"num_fewshot": 3,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b13b/evaluation/8b7178b13b_4_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.2843333333333333,
|
5 |
+
"em_stderr": 0.008237227300544015
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers",
|
14 |
+
"num_fewshot": 4,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b13b/evaluation/8b7178b13b_5_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.30933333333333335,
|
5 |
+
"em_stderr": 0.008440329009701236
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers",
|
14 |
+
"num_fewshot": 5,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.29175676576014364, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034471150363075184}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2607907068748222, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002898135289732422}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.23794097030152866, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002188642411213601}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.08536913371644045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0020895235460299017}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0713925019948092, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001635476891768791}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06574644928335649, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013625725128389179}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.22160256648226004, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027745204374263316}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.19691795695851566, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002270661716223799}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1789951285071888, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001673666200127008}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.27421900185303183, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003293382529360642}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.24426464874342324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002735802329870702}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.22300315725329103, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020678687486032563}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.071985313483549, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07738970826466988}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2571754504445173, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0037874180373320154}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.22046706474081984, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032483649489955437}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.20409132873325006, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026059526618525918}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.07716310931244513, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0021423793043653955}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06255207307386669, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001665836586847627}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05803958336813696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013905246714186119}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.19917050763282695, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031058368371087975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.16863998232504207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025563859107073333}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1557150280292743, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020175919818759087}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.24286851428671166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003627579238772214}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2070808578907387, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003069864948055298}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.19198507442281348, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002473123474513908}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.256877193820952, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07385158775345343}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.08608412704276916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031073010197710128}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.07306085845324418, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026739004311174368}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06754366655458334, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00231821642571142}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.02477632495929693, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013952242546163309}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.020963360092888926, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011677531292808372}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.01891360756998624, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009547131510927643}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.06705402166283986, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0024891810743964266}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.05633331544121819, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021123430958068772}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.05180637434259422, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017937325092342497}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.08098856968063572, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002946288914451725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06822240197436494, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002506023867308553}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.06316340092042942, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021760431723604587}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.11803096947131637, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.018616113127493047}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.014929032882940911, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001437252981426829}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.012605454696432882, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012515184538477262}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.01167038909001005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010944129890059014}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0041010832248453595, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005815077800226312}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0034469110258534192, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0004725161504235943}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.003203820995720566, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00042103771194584703}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.011742235186401492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011661092630625284}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.009919184883424567, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0010141555758155579}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.009059396220908112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008608509358819988}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.014255925532109718, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001380098431266019}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.012021925711305772, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001202720217240228}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.011096298465596519, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0010440702662319144}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 5.860100637257344e-12, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.0205550894986597e-10}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_3.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.21592444335846092, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004246983520225218}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.27255737663479357, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00447437166346409}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.22044943741927525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003500198780987625}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.05330214126583765, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025012370800294183}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06515904722605939, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025535061286592504}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.05351615652078515, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022197484992897607}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.16172792138960806, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003474612361597086}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.20248110828398325, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034801799909718704}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.16421475353571632, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028136325952485735}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.16634670992387487, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034675307885691244}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.21304752670955557, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003802772338811888}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.17052032174118803, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002883605135295133}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.4283658434676942, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1084231496409544}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_4.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06147088600466444, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003807688121301648}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.06919835565602203, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004080878740872531}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05798370899016291, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003309673469340258}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.014228533929974868, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014047860954449383}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0166314025722418, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014602317545186565}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.013641962174467587, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012106481387976049}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.04649724897774529, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029768758298026627}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.05168291535676844, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003094715210052394}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.043392440237132235, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002526920635642375}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.048035895047127686, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003028118969042768}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.05464177271239397, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0033104672738008553}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04537985187674532, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026361408489140024}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.40228156399700404, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0795066489605694}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_5.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002305114359249611, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006855365150103617}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.002688750909485673, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008419301936781596}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0022410464211047414, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006379815102748624}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.00023662681974673803, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00010805221445763836}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0003191847290509596, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00013624996945101362}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0002428157678792438, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00010121603253679161}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0014967981731443785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00044282917196946015}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0018089244853048499, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005955143349649119}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.001480383954453164, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004335775279500401}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0014611359517881007, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0004268628859757995}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0017987055420995693, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005977174922462606}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0014592568247182058, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00042833093506174205}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.2194816597556972e-29, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.9673150803997765e-21}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3d50b98ce786c24110901e7da0bfb8f2f0b1a8997df2194229acd5bc122dc22d
|
3 |
+
size 18605434
|
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6fcc32b2a7a9a4ec85ad6b084bb458f744025984db325f5f2f95bc51178c1ef9
|
3 |
+
size 24070805
|
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ff0281a58876590084570bea0b8feb099efce7b6bd5f1dded593af2c7f2d0146
|
3 |
+
size 29380490
|
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6f9209c03a21b26044ca2675e69b17b1e5cd47f660be0226fba1abdd49d99233
|
3 |
+
size 34786842
|
8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d38077208e1f7fd684e78d720c69585bf6441502b4381400558f457020508dfa
|
3 |
+
size 9525984
|
8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ebf648f5afd69404494ab6936ba7266cd146373c249af58da4c8975a49b6c660
|
3 |
+
size 11646229
|
8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bd422c7e3808f695d266dfee392525cea2dbb5f620848c74d4478f52bf23f26f
|
3 |
+
size 13898127
|
8b7178b13b/evaluation/generation/merged.csv
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.13957033665597848
|
3 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.13957033665597848
|
4 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.21971297989413593
|
5 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.21971297989413593
|
6 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2436824998963185
|
7 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2436824998963185
|
8 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2526618416523279
|
9 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2526618416523279
|
10 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2559926229244319
|
11 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2559926229244319
|
12 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.259556048619835
|
13 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.259556048619835
|
14 |
+
e2e_nlg_cleaned,5,average,multiple,0.22852938827383795
|
15 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.021176795907655737
|
16 |
+
gem_xsum,0,median,rouge2_fmeasure,0.021176795907655737
|
17 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.050641256552544464
|
18 |
+
gem_xsum,1,median,rouge2_fmeasure,0.050641256552544464
|
19 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05359901469303583
|
20 |
+
gem_xsum,2,median,rouge2_fmeasure,0.05359901469303583
|
21 |
+
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05351615652078515
|
22 |
+
gem_xsum,3,median,rouge2_fmeasure,0.05351615652078515
|
23 |
+
gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.013641962174467587
|
24 |
+
gem_xsum,4,median,rouge2_fmeasure,0.013641962174467587
|
25 |
+
gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0002428157678792438
|
26 |
+
gem_xsum,5,median,rouge2_fmeasure,0.0002428157678792438
|
27 |
+
gem_xsum,5,average,multiple,0.032136333602728
|
28 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.047608492095606206
|
29 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.047608492095606206
|
30 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08120001213349398
|
31 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.08120001213349398
|
32 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.11056055329882854
|
33 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.11056055329882854
|
34 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.12608234413935157
|
35 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.12608234413935157
|
36 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.1354322190391961
|
37 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.1354322190391961
|
38 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.1419903076188946
|
39 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.1419903076188946
|
40 |
+
web_nlg_en,5,average,multiple,0.10714565472089517
|
41 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04284409748743667
|
42 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.04284409748743667
|
43 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05107550928141819
|
44 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.05107550928141819
|
45 |
+
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06574644928335649
|
46 |
+
wiki_lingua_en,2,median,rouge2_fmeasure,0.06574644928335649
|
47 |
+
wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05803958336813696
|
48 |
+
wiki_lingua_en,3,median,rouge2_fmeasure,0.05803958336813696
|
49 |
+
wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01891360756998624
|
50 |
+
wiki_lingua_en,4,median,rouge2_fmeasure,0.01891360756998624
|
51 |
+
wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.003203820995720566
|
52 |
+
wiki_lingua_en,5,median,rouge2_fmeasure,0.003203820995720566
|
53 |
+
wiki_lingua_en,5,average,multiple,0.039970511331009186
|
8b7178b13b/evaluation/generation/merged.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4522375218061915, "bleu_stderr": 0.04842537846111568, "rouge1_fmeasure": 0.10639641446726286, "rouge1_fmeasure_stderr": 0.002372783014156511, "rouge1_precision": 0.09405156373772382, "rouge1_precision_stderr": 0.0036589357455526954, "rouge1_recall": 0.2623187491640316, "rouge1_recall_stderr": 0.0054853417994958405, "rouge2_fmeasure": 0.047608492095606206, "rouge2_fmeasure_stderr": 0.001384256918874843, "rouge2_precision": 0.04001471903798784, "rouge2_precision_stderr": 0.002167193193860632, "rouge2_recall": 0.12482367655474384, "rouge2_recall_stderr": 0.003329638916496446, "rougeL_fmeasure": 0.09811489729413486, "rougeL_fmeasure_stderr": 0.002141247906215953, "rougeL_precision": 0.08735958257324784, "rougeL_precision_stderr": 0.0035013157983101876, "rougeL_recall": 0.2450465857818092, "rougeL_recall_stderr": 0.005156857986010396, "rougeLsum_fmeasure": 0.0992851030641096, "rougeLsum_fmeasure_stderr": 0.0022026560740801633, "rougeLsum_precision": 0.08882938411931994, "rougeLsum_precision_stderr": 0.0035668764760974714, "rougeLsum_recall": 0.24452754279908495, "rougeLsum_recall_stderr": 0.005065510424462396}}, "1": {"PALM_prompt": {"bleu": 0.650629434960658, "bleu_stderr": 0.04811417898690189, "rouge1_fmeasure": 0.16281032775122675, "rouge1_fmeasure_stderr": 0.003905335709899557, "rouge1_precision": 0.15916782201345867, "rouge1_precision_stderr": 0.005131870879954661, "rouge1_recall": 0.29665725707782414, "rouge1_recall_stderr": 0.005228319001095806, "rouge2_fmeasure": 0.08120001213349398, "rouge2_fmeasure_stderr": 0.0026664765201647343, "rouge2_precision": 0.07819648278455865, "rouge2_precision_stderr": 0.0034063589997107026, "rouge2_recall": 0.15070155918572603, "rouge2_recall_stderr": 0.003623729048520002, "rougeL_fmeasure": 0.14574992383080743, "rougeL_fmeasure_stderr": 0.003397730356308293, "rougeL_precision": 0.14191196636841233, "rougeL_precision_stderr": 0.004635280920281775, "rougeL_recall": 0.2736224046058516, "rougeL_recall_stderr": 0.004816339822450663, "rougeLsum_fmeasure": 0.14966591828719222, "rougeLsum_fmeasure_stderr": 0.003504137587849123, "rougeLsum_precision": 0.14648404402072937, "rougeLsum_precision_stderr": 0.004776844963861229, "rougeLsum_recall": 0.2774966849512831, "rougeLsum_recall_stderr": 0.004834496186187873}}, "2": {"PALM_prompt": {"bleu": 0.9046528025555703, "bleu_stderr": 0.04317473166983863, "rouge1_fmeasure": 0.2126025964717363, "rouge1_fmeasure_stderr": 0.004487229116514054, "rouge1_precision": 0.21014915270499354, "rouge1_precision_stderr": 0.005813053226743566, "rouge1_recall": 0.35653672883727167, "rouge1_recall_stderr": 0.0051494656409290275, "rouge2_fmeasure": 0.11056055329882854, "rouge2_fmeasure_stderr": 0.0031590233571128605, "rouge2_precision": 0.11091162929767316, "rouge2_precision_stderr": 0.003989043309470051, "rouge2_recall": 0.18668399640135247, "rouge2_recall_stderr": 0.0039177512048306565, "rougeL_fmeasure": 0.18634690018126043, "rougeL_fmeasure_stderr": 0.0038448807973693925, "rougeL_precision": 0.1825110333764981, "rougeL_precision_stderr": 0.005070346564582974, "rougeL_recall": 0.32304322748014636, "rougeL_recall_stderr": 0.004678142532303707, "rougeLsum_fmeasure": 0.19202573601368517, "rougeLsum_fmeasure_stderr": 0.003952591052036519, "rougeLsum_precision": 0.18939634311269352, "rougeLsum_precision_stderr": 0.005258445068941222, "rougeLsum_recall": 0.32966626591290826, "rougeLsum_recall_stderr": 0.004724502434373412}}, "3": {"PALM_prompt": {"bleu": 1.1442800703272336, "bleu_stderr": 0.05161996592570338, "rouge1_fmeasure": 0.23508991254207284, "rouge1_fmeasure_stderr": 0.004629614568550998, "rouge1_precision": 0.23213625362665477, "rouge1_precision_stderr": 0.0059610834558063595, "rouge1_recall": 0.38062292481677, "rouge1_recall_stderr": 0.005078282219671234, "rouge2_fmeasure": 0.12608234413935157, "rouge2_fmeasure_stderr": 0.003256601130342381, "rouge2_precision": 0.12811335412630678, "rouge2_precision_stderr": 0.004200144761611546, "rouge2_recall": 0.20379204157806646, "rouge2_recall_stderr": 0.00386170912683119, "rougeL_fmeasure": 0.20320672155464048, "rougeL_fmeasure_stderr": 0.003882337416765662, "rougeL_precision": 0.1997394107463207, "rougeL_precision_stderr": 0.005153079376868987, "rougeL_recall": 0.3397942315323993, "rougeL_recall_stderr": 0.004538527029075319, "rougeLsum_fmeasure": 0.21069756332821474, "rougeLsum_fmeasure_stderr": 0.004028196190821567, "rougeLsum_precision": 0.2083043346123531, "rougeLsum_precision_stderr": 0.005363455263356501, "rougeLsum_recall": 0.347929987856493, "rougeLsum_recall_stderr": 0.0045764471921679985}}, "4": {"PALM_prompt": {"bleu": 1.3793783786539846, "bleu_stderr": 0.08292725693632953, "rouge1_fmeasure": 0.25107613979636906, "rouge1_fmeasure_stderr": 0.00476323540590602, "rouge1_precision": 0.25146617993410997, "rouge1_precision_stderr": 0.006199020027337637, "rouge1_recall": 0.3925026283739189, "rouge1_recall_stderr": 0.005116605241451321, "rouge2_fmeasure": 0.1354322190391961, "rouge2_fmeasure_stderr": 0.003385535533668589, "rouge2_precision": 0.1385226844845104, "rouge2_precision_stderr": 0.004282956786141916, "rouge2_recall": 0.21131194271802842, "rouge2_recall_stderr": 0.0040385627196912076, "rougeL_fmeasure": 0.2160535699510597, "rougeL_fmeasure_stderr": 0.003987483818543123, "rougeL_precision": 0.21425855000222013, "rougeL_precision_stderr": 0.005261027090142587, "rougeL_recall": 0.34988090714764264, "rougeL_recall_stderr": 0.004590444333954342, "rougeLsum_fmeasure": 0.22537108240785111, "rougeLsum_fmeasure_stderr": 0.004170782290413772, "rougeLsum_precision": 0.22545655976991655, "rougeLsum_precision_stderr": 0.0055589913005229, "rougeLsum_recall": 0.36010176303537716, "rougeLsum_recall_stderr": 0.0046603281582664625}}, "5": {"PALM_prompt": {"bleu": 1.4145829791141218, "bleu_stderr": 0.06756850302725374, "rouge1_fmeasure": 0.25726246836968625, "rouge1_fmeasure_stderr": 0.004880864665323261, "rouge1_precision": 0.25559388874117145, "rouge1_precision_stderr": 0.006265188037282814, "rouge1_recall": 0.4014517195417289, "rouge1_recall_stderr": 0.0052097160639095145, "rouge2_fmeasure": 0.1419903076188946, "rouge2_fmeasure_stderr": 0.003491421839159055, "rouge2_precision": 0.1462902529413211, "rouge2_precision_stderr": 0.004502888503688928, "rouge2_recall": 0.22067604236758182, "rouge2_recall_stderr": 0.0041554431177298085, "rougeL_fmeasure": 0.22145748379950564, "rougeL_fmeasure_stderr": 0.004104691248611666, "rougeL_precision": 0.21838534521953784, "rougeL_precision_stderr": 0.005386484846358847, "rougeL_recall": 0.35807492888309944, "rougeL_recall_stderr": 0.004713215605602484, "rougeLsum_fmeasure": 0.2308946484376556, "rougeLsum_fmeasure_stderr": 0.004281312884008834, "rougeLsum_precision": 0.22909884605448813, "rougeLsum_precision_stderr": 0.00564116547513353, "rougeLsum_recall": 0.3686456415754311, "rougeLsum_recall_stderr": 0.004784333349134369}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.5768645983512664, "bleu_stderr": 0.10390732312462625, "rouge1_fmeasure": 0.1810278543145764, "rouge1_fmeasure_stderr": 0.002359129908964426, "rouge1_precision": 0.1631004447259747, "rouge1_precision_stderr": 0.0025140049334374266, "rouge1_recall": 0.24869580468026348, "rouge1_recall_stderr": 0.0032277820635021014, "rouge2_fmeasure": 0.04284409748743667, "rouge2_fmeasure_stderr": 0.001036773633130634, "rouge2_precision": 0.0385111191751266, "rouge2_precision_stderr": 0.0009986244610694906, "rouge2_recall": 0.05950986614858929, "rouge2_recall_stderr": 0.0015515657908224004, "rougeL_fmeasure": 0.1323060407446597, "rougeL_fmeasure_stderr": 0.001666744596513707, "rougeL_precision": 0.11812150163422086, "rougeL_precision_stderr": 0.0017909292945280444, "rougeL_recall": 0.18668011105548613, "rougeL_recall_stderr": 0.002526858332113489, "rougeLsum_fmeasure": 0.16822692441451978, "rougeLsum_fmeasure_stderr": 0.0021965424695315023, "rougeLsum_precision": 0.15149118025181801, "rougeLsum_precision_stderr": 0.0023436532932745767, "rougeLsum_recall": 0.23193053314919995, "rougeLsum_recall_stderr": 0.003047509010443981}}, "1": {"tldr_en": {"bleu": 3.2805418289148096, "bleu_stderr": 0.06930736886640082, "rouge1_fmeasure": 0.20446918503496525, "rouge1_fmeasure_stderr": 0.0022069372036890365, "rouge1_precision": 0.24381809168774637, "rouge1_precision_stderr": 0.0033022582237546634, "rouge1_recall": 0.23017687741596388, "rouge1_recall_stderr": 0.0028624354435880044, "rouge2_fmeasure": 0.05107550928141819, "rouge2_fmeasure_stderr": 0.0012372719079850197, "rouge2_precision": 0.064059693409422, "rouge2_precision_stderr": 0.0018318331544901892, "rouge2_recall": 0.05717654707897076, "rouge2_recall_stderr": 0.0015139726468610508, "rougeL_fmeasure": 0.15417240714255895, "rougeL_fmeasure_stderr": 0.001662532523275427, "rougeL_precision": 0.18545125118813593, "rougeL_precision_stderr": 0.0026173569724847225, "rougeL_recall": 0.1747030421481033, "rougeL_recall_stderr": 0.0022396644779822898, "rougeLsum_fmeasure": 0.19083537889844354, "rougeLsum_fmeasure_stderr": 0.002054035558945731, "rougeLsum_precision": 0.2280337645476899, "rougeLsum_precision_stderr": 0.003112015382305472, "rougeLsum_recall": 0.21469429192649783, "rougeLsum_recall_stderr": 0.0026546326180329014}}, "2": {"tldr_en": {"bleu": 4.071985313483549, "bleu_stderr": 0.07738970826466988, "rouge1_fmeasure": 0.23794097030152866, "rouge1_fmeasure_stderr": 0.002188642411213601, "rouge1_precision": 0.29175676576014364, "rouge1_precision_stderr": 0.0034471150363075184, "rouge1_recall": 0.2607907068748222, "rouge1_recall_stderr": 0.002898135289732422, "rouge2_fmeasure": 0.06574644928335649, "rouge2_fmeasure_stderr": 0.0013625725128389179, "rouge2_precision": 0.08536913371644045, "rouge2_precision_stderr": 0.0020895235460299017, "rouge2_recall": 0.0713925019948092, "rouge2_recall_stderr": 0.001635476891768791, "rougeL_fmeasure": 0.1789951285071888, "rougeL_fmeasure_stderr": 0.001673666200127008, "rougeL_precision": 0.22160256648226004, "rougeL_precision_stderr": 0.0027745204374263316, "rougeL_recall": 0.19691795695851566, "rougeL_recall_stderr": 0.002270661716223799, "rougeLsum_fmeasure": 0.22300315725329103, "rougeLsum_fmeasure_stderr": 0.0020678687486032563, "rougeLsum_precision": 0.27421900185303183, "rougeLsum_precision_stderr": 0.003293382529360642, "rougeLsum_recall": 0.24426464874342324, "rougeLsum_recall_stderr": 0.002735802329870702}}, "3": {"tldr_en": {"bleu": 3.256877193820952, "bleu_stderr": 0.07385158775345343, "rouge1_fmeasure": 0.20409132873325006, "rouge1_fmeasure_stderr": 0.0026059526618525918, "rouge1_precision": 0.2571754504445173, "rouge1_precision_stderr": 0.0037874180373320154, "rouge1_recall": 0.22046706474081984, "rouge1_recall_stderr": 0.0032483649489955437, "rouge2_fmeasure": 0.05803958336813696, "rouge2_fmeasure_stderr": 0.0013905246714186119, "rouge2_precision": 0.07716310931244513, "rouge2_precision_stderr": 0.0021423793043653955, "rouge2_recall": 0.06255207307386669, "rouge2_recall_stderr": 0.001665836586847627, "rougeL_fmeasure": 0.1557150280292743, "rougeL_fmeasure_stderr": 0.0020175919818759087, "rougeL_precision": 0.19917050763282695, "rougeL_precision_stderr": 0.0031058368371087975, "rougeL_recall": 0.16863998232504207, "rougeL_recall_stderr": 0.0025563859107073333, "rougeLsum_fmeasure": 0.19198507442281348, "rougeLsum_fmeasure_stderr": 0.002473123474513908, "rougeLsum_precision": 0.24286851428671166, "rougeLsum_precision_stderr": 0.003627579238772214, "rougeLsum_recall": 0.2070808578907387, "rougeLsum_recall_stderr": 0.003069864948055298}}, "4": {"tldr_en": {"bleu": 0.11803096947131637, "bleu_stderr": 0.018616113127493047, "rouge1_fmeasure": 0.06754366655458334, "rouge1_fmeasure_stderr": 0.00231821642571142, "rouge1_precision": 0.08608412704276916, "rouge1_precision_stderr": 0.0031073010197710128, "rouge1_recall": 0.07306085845324418, "rouge1_recall_stderr": 0.0026739004311174368, "rouge2_fmeasure": 0.01891360756998624, "rouge2_fmeasure_stderr": 0.0009547131510927643, "rouge2_precision": 0.02477632495929693, "rouge2_precision_stderr": 0.0013952242546163309, "rouge2_recall": 0.020963360092888926, "rouge2_recall_stderr": 0.0011677531292808372, "rougeL_fmeasure": 0.05180637434259422, "rougeL_fmeasure_stderr": 0.0017937325092342497, "rougeL_precision": 0.06705402166283986, "rougeL_precision_stderr": 0.0024891810743964266, "rougeL_recall": 0.05633331544121819, "rougeL_recall_stderr": 0.0021123430958068772, "rougeLsum_fmeasure": 0.06316340092042942, "rougeLsum_fmeasure_stderr": 0.0021760431723604587, "rougeLsum_precision": 0.08098856968063572, "rougeLsum_precision_stderr": 0.002946288914451725, "rougeLsum_recall": 0.06822240197436494, "rougeLsum_recall_stderr": 0.002506023867308553}}, "5": {"tldr_en": {"bleu": 5.860100637257344e-12, "bleu_stderr": 1.0205550894986597e-10, "rouge1_fmeasure": 0.01167038909001005, "rouge1_fmeasure_stderr": 0.0010944129890059014, "rouge1_precision": 0.014929032882940911, "rouge1_precision_stderr": 0.001437252981426829, "rouge1_recall": 0.012605454696432882, "rouge1_recall_stderr": 0.0012515184538477262, "rouge2_fmeasure": 0.003203820995720566, "rouge2_fmeasure_stderr": 0.00042103771194584703, "rouge2_precision": 0.0041010832248453595, "rouge2_precision_stderr": 0.0005815077800226312, "rouge2_recall": 0.0034469110258534192, "rouge2_recall_stderr": 0.0004725161504235943, "rougeL_fmeasure": 0.009059396220908112, "rougeL_fmeasure_stderr": 0.0008608509358819988, "rougeL_precision": 0.011742235186401492, "rougeL_precision_stderr": 0.0011661092630625284, "rougeL_recall": 0.009919184883424567, "rougeL_recall_stderr": 0.0010141555758155579, "rougeLsum_fmeasure": 0.011096298465596519, "rougeLsum_fmeasure_stderr": 0.0010440702662319144, "rougeLsum_precision": 0.014255925532109718, "rougeLsum_precision_stderr": 0.001380098431266019, "rougeLsum_recall": 0.012021925711305772, "rougeLsum_recall_stderr": 0.001202720217240228}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 7.518228772851459, "bleu_stderr": 0.07988589142118296, "rouge1_fmeasure": 0.30892622395914193, "rouge1_fmeasure_stderr": 0.002362726614809891, "rouge1_precision": 0.2527585937479932, "rouge1_precision_stderr": 0.0019586691047001085, "rouge1_recall": 0.4532685157753863, "rouge1_recall_stderr": 0.0036911339160002263, "rouge2_fmeasure": 0.13957033665597848, "rouge2_fmeasure_stderr": 0.001587978757027655, "rouge2_precision": 0.10827294752994317, "rouge2_precision_stderr": 0.0013193546727602984, "rouge2_recall": 0.21206177532278597, "rouge2_recall_stderr": 0.0024697648114498094, "rougeL_fmeasure": 0.25201104517465034, "rougeL_fmeasure_stderr": 0.0018344182759923726, "rougeL_precision": 0.2074761734719538, "rougeL_precision_stderr": 0.0015484435087844478, "rougeL_recall": 0.3707049758227277, "rougeL_recall_stderr": 0.002992917144452656, "rougeLsum_fmeasure": 0.2721367746591264, "rougeLsum_fmeasure_stderr": 0.002175228967316158, "rougeLsum_precision": 0.22367271169207667, "rougeLsum_precision_stderr": 0.0018161195853035582, "rougeLsum_recall": 0.3994503148641263, "rougeLsum_recall_stderr": 0.0034245642845668594}}, "1": {"generate_text_restaurant": {"bleu": 11.762713013802438, "bleu_stderr": 0.09382576543199773, "rouge1_fmeasure": 0.4698359144705449, "rouge1_fmeasure_stderr": 0.0023428195562317463, "rouge1_precision": 0.5758411505584643, "rouge1_precision_stderr": 0.0032641614095776606, "rouge1_recall": 0.43699974124480134, "rouge1_recall_stderr": 0.0030360517355224794, "rouge2_fmeasure": 0.21971297989413593, "rouge2_fmeasure_stderr": 0.0020517231278393165, "rouge2_precision": 0.27339290904893143, "rouge2_precision_stderr": 0.0027333405330301377, "rouge2_recall": 0.20425825992656632, "rouge2_recall_stderr": 0.0021962926617281136, "rougeL_fmeasure": 0.3397129377482095, "rougeL_fmeasure_stderr": 0.0020808579737579473, "rougeL_precision": 0.4200242586753886, "rougeL_precision_stderr": 0.0030278428531837543, "rougeL_recall": 0.31470569352158173, "rougeL_recall_stderr": 0.002441278894908242, "rougeLsum_fmeasure": 0.3826629288347234, "rougeLsum_fmeasure_stderr": 0.0023388359390624446, "rougeLsum_precision": 0.4702024862705025, "rougeLsum_precision_stderr": 0.0032151475190549403, "rougeLsum_recall": 0.3555458059380285, "rougeLsum_recall_stderr": 0.0027838026083691135}}, "2": {"generate_text_restaurant": {"bleu": 14.028619911508963, "bleu_stderr": 0.17673898243267264, "rouge1_fmeasure": 0.4982696315856735, "rouge1_fmeasure_stderr": 0.002277178664730148, "rouge1_precision": 0.5880328986485642, "rouge1_precision_stderr": 0.0031633644314478966, "rouge1_recall": 0.4720615710167169, "rouge1_recall_stderr": 0.0030042579734499103, "rouge2_fmeasure": 0.2436824998963185, "rouge2_fmeasure_stderr": 0.0021479952999434505, "rouge2_precision": 0.2905416018050732, "rouge2_precision_stderr": 0.0027458331588556565, "rouge2_recall": 0.2313719045642725, "rouge2_recall_stderr": 0.002349650773654754, "rougeL_fmeasure": 0.36315580245879386, "rougeL_fmeasure_stderr": 0.0021103667472393136, "rougeL_precision": 0.4304826242102334, "rougeL_precision_stderr": 0.002921537898047091, "rougeL_recall": 0.34359634632916913, "rougeL_recall_stderr": 0.002538014950574236, "rougeLsum_fmeasure": 0.41230819320152157, "rougeLsum_fmeasure_stderr": 0.0023628359694983633, "rougeLsum_precision": 0.4866663175226494, "rougeLsum_precision_stderr": 0.003134734940667762, "rougeLsum_recall": 0.39070560600420967, "rougeLsum_recall_stderr": 0.002859660656155989}}, "3": {"generate_text_restaurant": {"bleu": 14.824429124321526, "bleu_stderr": 0.1257638219512965, "rouge1_fmeasure": 0.5070937995389966, "rouge1_fmeasure_stderr": 0.0021993659376666874, "rouge1_precision": 0.5903839264246126, "rouge1_precision_stderr": 0.003096593178519806, "rouge1_recall": 0.48303754455261133, "rouge1_recall_stderr": 0.0029313776925495253, "rouge2_fmeasure": 0.2526618416523279, "rouge2_fmeasure_stderr": 0.0021298129637362085, "rouge2_precision": 0.2971819596009762, "rouge2_precision_stderr": 0.0027204840509740583, "rouge2_recall": 0.24110048435208156, "rouge2_recall_stderr": 0.002348630487706421, "rougeL_fmeasure": 0.37097982222923104, "rougeL_fmeasure_stderr": 0.00214061071940383, "rougeL_precision": 0.43347382731719464, "rougeL_precision_stderr": 0.0029257357889719074, "rougeL_recall": 0.3529852168172252, "rougeL_recall_stderr": 0.002550242635255893, "rougeLsum_fmeasure": 0.42203281188344766, "rougeLsum_fmeasure_stderr": 0.0023652228936698657, "rougeLsum_precision": 0.49123394352656125, "rougeLsum_precision_stderr": 0.0031248395828196904, "rougeLsum_recall": 0.4020764458847546, "rougeLsum_recall_stderr": 0.0028446825862533704}}, "4": {"generate_text_restaurant": {"bleu": 15.1456115809235, "bleu_stderr": 0.1628753687758124, "rouge1_fmeasure": 0.5145793651843434, "rouge1_fmeasure_stderr": 0.002244489356919281, "rouge1_precision": 0.5945998345487046, "rouge1_precision_stderr": 0.003074613754297493, "rouge1_recall": 0.48854055206810637, "rouge1_recall_stderr": 0.0028822641580179367, "rouge2_fmeasure": 0.2559926229244319, "rouge2_fmeasure_stderr": 0.002186356460577112, "rouge2_precision": 0.29784097926829245, "rouge2_precision_stderr": 0.0026941678409656135, "rouge2_recall": 0.2436337287979087, "rouge2_recall_stderr": 0.0023716503152493335, "rougeL_fmeasure": 0.3747326640922719, "rougeL_fmeasure_stderr": 0.002148508706418628, "rougeL_precision": 0.433653082379793, "rougeL_precision_stderr": 0.002833488672483008, "rougeL_recall": 0.3558030676746678, "rougeL_recall_stderr": 0.0025159494686373797, "rougeLsum_fmeasure": 0.42740717670424816, "rougeLsum_fmeasure_stderr": 0.0023775373903584217, "rougeLsum_precision": 0.49357503577481815, "rougeLsum_precision_stderr": 0.0030663612183832903, "rougeLsum_recall": 0.4058018094790397, "rougeLsum_recall_stderr": 0.002791634703514705}}, "5": {"generate_text_restaurant": {"bleu": 15.184590176560544, "bleu_stderr": 0.17570671649096134, "rouge1_fmeasure": 0.5172310679190623, "rouge1_fmeasure_stderr": 0.002187662941341786, "rouge1_precision": 0.5980619294865127, "rouge1_precision_stderr": 0.003055147702862821, "rouge1_recall": 0.4899430329979205, "rouge1_recall_stderr": 0.002838541142993511, "rouge2_fmeasure": 0.259556048619835, "rouge2_fmeasure_stderr": 0.002143271876534031, "rouge2_precision": 0.3029600136699186, "rouge2_precision_stderr": 0.002703515351522382, "rouge2_recall": 0.24620276831802643, "rouge2_recall_stderr": 0.0023264378571411816, "rougeL_fmeasure": 0.37807698267399703, "rougeL_fmeasure_stderr": 0.002126725638562394, "rougeL_precision": 0.4380103729999666, "rougeL_precision_stderr": 0.0028424300466825983, "rougeL_recall": 0.35795640195755224, "rougeL_recall_stderr": 0.0024826420662082673, "rougeLsum_fmeasure": 0.4311732922825964, "rougeLsum_fmeasure_stderr": 0.0023482937755900097, "rougeLsum_precision": 0.4982942932710782, "rougeLsum_precision_stderr": 0.0030594319184573462, "rougeLsum_recall": 0.40852131422574206, "rougeLsum_recall_stderr": 0.0027745302682206834}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.0840616150366835, "bleu_stderr": 0.10220605780455636, "rouge1_fmeasure": 0.12067679782231505, "rouge1_fmeasure_stderr": 0.003015872334492621, "rouge1_precision": 0.09160247708817207, "rouge1_precision_stderr": 0.002375172469912284, "rouge1_recall": 0.19613476599918667, "rouge1_recall_stderr": 0.005073318605110471, "rouge2_fmeasure": 0.021176795907655737, "rouge2_fmeasure_stderr": 0.0011603757977784665, "rouge2_precision": 0.015487063309988504, "rouge2_precision_stderr": 0.000848981732321504, "rouge2_recall": 0.0363837944571209, "rouge2_recall_stderr": 0.0020790710888273895, "rougeL_fmeasure": 0.09086726074181874, "rougeL_fmeasure_stderr": 0.0022256142858678965, "rougeL_precision": 0.06937634282139736, "rougeL_precision_stderr": 0.001813605739492327, "rougeL_recall": 0.1479963053053532, "rougeL_recall_stderr": 0.0038099944087791725, "rougeLsum_fmeasure": 0.09880097411831448, "rougeLsum_fmeasure_stderr": 0.0024614730448775535, "rougeLsum_precision": 0.07512656050190324, "rougeLsum_precision_stderr": 0.001959507720128778, "rougeLsum_recall": 0.1612082948452722, "rougeLsum_recall_stderr": 0.004219489865399336}}, "1": {"article_DOC_summary": {"bleu": 2.1144830791719307, "bleu_stderr": 0.11445499213864452, "rouge1_fmeasure": 0.22223912621847974, "rouge1_fmeasure_stderr": 0.003038126692025604, "rouge1_precision": 0.2003179694226108, "rouge1_precision_stderr": 0.003646294986793581, "rouge1_recall": 0.3082326819925303, "rouge1_recall_stderr": 0.004421626418186149, "rouge2_fmeasure": 0.050641256552544464, "rouge2_fmeasure_stderr": 0.002016315458850625, "rouge2_precision": 0.04617082546967428, "rouge2_precision_stderr": 0.0021567164563758494, "rouge2_recall": 0.07074171325045796, "rouge2_recall_stderr": 0.0026782882522819163, "rougeL_fmeasure": 0.16589957489115423, "rougeL_fmeasure_stderr": 0.0024349153804753905, "rougeL_precision": 0.15020466828774898, "rougeL_precision_stderr": 0.002974138334891651, "rougeL_recall": 0.23037723268498672, "rougeL_recall_stderr": 0.0034903634923284257, "rougeLsum_fmeasure": 0.17413975005815832, "rougeLsum_fmeasure_stderr": 0.0025334465295397523, "rougeLsum_precision": 0.156190697593295, "rougeLsum_precision_stderr": 0.0029737943854797597, "rougeLsum_recall": 0.2443326358730322, "rougeLsum_recall_stderr": 0.003851152935036917}}, "2": {"article_DOC_summary": {"bleu": 2.2306956500254795, "bleu_stderr": 0.1534426529740081, "rouge1_fmeasure": 0.22874247828238384, "rouge1_fmeasure_stderr": 0.003191868132841587, "rouge1_precision": 0.2167785998186487, "rouge1_precision_stderr": 0.0038703703581663036, "rouge1_recall": 0.29079354607238944, "rouge1_recall_stderr": 0.0042086230091149205, "rouge2_fmeasure": 0.05359901469303583, "rouge2_fmeasure_stderr": 0.002063273466751458, "rouge2_precision": 0.05175962394844965, "rouge2_precision_stderr": 0.002262457698273439, "rouge2_recall": 0.06767519675126271, "rouge2_recall_stderr": 0.002531468931801855, "rougeL_fmeasure": 0.17116883513373068, "rougeL_fmeasure_stderr": 0.002601631095778569, "rougeL_precision": 0.1626685190790419, "rougeL_precision_stderr": 0.003181540362928691, "rougeL_recall": 0.2176434055764335, "rougeL_recall_stderr": 0.0033468188991687773, "rougeLsum_fmeasure": 0.17781283216267912, "rougeLsum_fmeasure_stderr": 0.0026319963165001175, "rougeLsum_precision": 0.16752103628195428, "rougeLsum_precision_stderr": 0.0031476246457933605, "rougeLsum_recall": 0.2286412427632142, "rougeLsum_recall_stderr": 0.003582394216085152}}, "3": {"article_DOC_summary": {"bleu": 2.4283658434676942, "bleu_stderr": 0.1084231496409544, "rouge1_fmeasure": 0.22044943741927525, "rouge1_fmeasure_stderr": 0.003500198780987625, "rouge1_precision": 0.21592444335846092, "rouge1_precision_stderr": 0.004246983520225218, "rouge1_recall": 0.27255737663479357, "rouge1_recall_stderr": 0.00447437166346409, "rouge2_fmeasure": 0.05351615652078515, "rouge2_fmeasure_stderr": 0.0022197484992897607, "rouge2_precision": 0.05330214126583765, "rouge2_precision_stderr": 0.0025012370800294183, "rouge2_recall": 0.06515904722605939, "rouge2_recall_stderr": 0.0025535061286592504, "rougeL_fmeasure": 0.16421475353571632, "rougeL_fmeasure_stderr": 0.0028136325952485735, "rougeL_precision": 0.16172792138960806, "rougeL_precision_stderr": 0.003474612361597086, "rougeL_recall": 0.20248110828398325, "rougeL_recall_stderr": 0.0034801799909718704, "rougeLsum_fmeasure": 0.17052032174118803, "rougeLsum_fmeasure_stderr": 0.002883605135295133, "rougeLsum_precision": 0.16634670992387487, "rougeLsum_precision_stderr": 0.0034675307885691244, "rougeLsum_recall": 0.21304752670955557, "rougeLsum_recall_stderr": 0.003802772338811888}}, "4": {"article_DOC_summary": {"bleu": 0.40228156399700404, "bleu_stderr": 0.0795066489605694, "rouge1_fmeasure": 0.05798370899016291, "rouge1_fmeasure_stderr": 0.003309673469340258, "rouge1_precision": 0.06147088600466444, "rouge1_precision_stderr": 0.003807688121301648, "rouge1_recall": 0.06919835565602203, "rouge1_recall_stderr": 0.004080878740872531, "rouge2_fmeasure": 0.013641962174467587, "rouge2_fmeasure_stderr": 0.0012106481387976049, "rouge2_precision": 0.014228533929974868, "rouge2_precision_stderr": 0.0014047860954449383, "rouge2_recall": 0.0166314025722418, "rouge2_recall_stderr": 0.0014602317545186565, "rougeL_fmeasure": 0.043392440237132235, "rougeL_fmeasure_stderr": 0.002526920635642375, "rougeL_precision": 0.04649724897774529, "rougeL_precision_stderr": 0.0029768758298026627, "rougeL_recall": 0.05168291535676844, "rougeL_recall_stderr": 0.003094715210052394, "rougeLsum_fmeasure": 0.04537985187674532, "rougeLsum_fmeasure_stderr": 0.0026361408489140024, "rougeLsum_precision": 0.048035895047127686, "rougeLsum_precision_stderr": 0.003028118969042768, "rougeLsum_recall": 0.05464177271239397, "rougeLsum_recall_stderr": 0.0033104672738008553}}, "5": {"article_DOC_summary": {"bleu": 1.2194816597556972e-29, "bleu_stderr": 1.9673150803997765e-21, "rouge1_fmeasure": 0.0022410464211047414, "rouge1_fmeasure_stderr": 0.0006379815102748624, "rouge1_precision": 0.002305114359249611, "rouge1_precision_stderr": 0.0006855365150103617, "rouge1_recall": 0.002688750909485673, "rouge1_recall_stderr": 0.0008419301936781596, "rouge2_fmeasure": 0.0002428157678792438, "rouge2_fmeasure_stderr": 0.00010121603253679161, "rouge2_precision": 0.00023662681974673803, "rouge2_precision_stderr": 0.00010805221445763836, "rouge2_recall": 0.0003191847290509596, "rouge2_recall_stderr": 0.00013624996945101362, "rougeL_fmeasure": 0.001480383954453164, "rougeL_fmeasure_stderr": 0.0004335775279500401, "rougeL_precision": 0.0014967981731443785, "rougeL_precision_stderr": 0.00044282917196946015, "rougeL_recall": 0.0018089244853048499, "rougeL_recall_stderr": 0.0005955143349649119, "rougeLsum_fmeasure": 0.0014592568247182058, "rougeLsum_fmeasure_stderr": 0.00042833093506174205, "rougeLsum_precision": 0.0014611359517881007, "rougeLsum_precision_stderr": 0.0004268628859757995, "rougeLsum_recall": 0.0017987055420995693, "rougeLsum_recall_stderr": 0.0005977174922462606}}}}
|
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": [
|
3 |
+
{
|
4 |
+
"task_name": "GEM/wiki_lingua_en",
|
5 |
+
"prompt_name": "tldr_en",
|
6 |
+
"rouge1_precision": 0.29175676576014364,
|
7 |
+
"dataset_path": "GEM/wiki_lingua",
|
8 |
+
"dataset_name": "en",
|
9 |
+
"subset": null,
|
10 |
+
"rouge1_precision_stderr": 0.0034471150363075184
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"task_name": "GEM/wiki_lingua_en",
|
14 |
+
"prompt_name": "tldr_en",
|
15 |
+
"rouge1_recall": 0.2607907068748222,
|
16 |
+
"dataset_path": "GEM/wiki_lingua",
|
17 |
+
"dataset_name": "en",
|
18 |
+
"subset": null,
|
19 |
+
"rouge1_recall_stderr": 0.002898135289732422
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"task_name": "GEM/wiki_lingua_en",
|
23 |
+
"prompt_name": "tldr_en",
|
24 |
+
"rouge1_fmeasure": 0.23794097030152866,
|
25 |
+
"dataset_path": "GEM/wiki_lingua",
|
26 |
+
"dataset_name": "en",
|
27 |
+
"subset": null,
|
28 |
+
"rouge1_fmeasure_stderr": 0.002188642411213601
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"task_name": "GEM/wiki_lingua_en",
|
32 |
+
"prompt_name": "tldr_en",
|
33 |
+
"rouge2_precision": 0.08536913371644045,
|
34 |
+
"dataset_path": "GEM/wiki_lingua",
|
35 |
+
"dataset_name": "en",
|
36 |
+
"subset": null,
|
37 |
+
"rouge2_precision_stderr": 0.0020895235460299017
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"task_name": "GEM/wiki_lingua_en",
|
41 |
+
"prompt_name": "tldr_en",
|
42 |
+
"rouge2_recall": 0.0713925019948092,
|
43 |
+
"dataset_path": "GEM/wiki_lingua",
|
44 |
+
"dataset_name": "en",
|
45 |
+
"subset": null,
|
46 |
+
"rouge2_recall_stderr": 0.001635476891768791
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"task_name": "GEM/wiki_lingua_en",
|
50 |
+
"prompt_name": "tldr_en",
|
51 |
+
"rouge2_fmeasure": 0.06574644928335649,
|
52 |
+
"dataset_path": "GEM/wiki_lingua",
|
53 |
+
"dataset_name": "en",
|
54 |
+
"subset": null,
|
55 |
+
"rouge2_fmeasure_stderr": 0.0013625725128389179
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"task_name": "GEM/wiki_lingua_en",
|
59 |
+
"prompt_name": "tldr_en",
|
60 |
+
"rougeL_precision": 0.22160256648226004,
|
61 |
+
"dataset_path": "GEM/wiki_lingua",
|
62 |
+
"dataset_name": "en",
|
63 |
+
"subset": null,
|
64 |
+
"rougeL_precision_stderr": 0.0027745204374263316
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"task_name": "GEM/wiki_lingua_en",
|
68 |
+
"prompt_name": "tldr_en",
|
69 |
+
"rougeL_recall": 0.19691795695851566,
|
70 |
+
"dataset_path": "GEM/wiki_lingua",
|
71 |
+
"dataset_name": "en",
|
72 |
+
"subset": null,
|
73 |
+
"rougeL_recall_stderr": 0.002270661716223799
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"task_name": "GEM/wiki_lingua_en",
|
77 |
+
"prompt_name": "tldr_en",
|
78 |
+
"rougeL_fmeasure": 0.1789951285071888,
|
79 |
+
"dataset_path": "GEM/wiki_lingua",
|
80 |
+
"dataset_name": "en",
|
81 |
+
"subset": null,
|
82 |
+
"rougeL_fmeasure_stderr": 0.001673666200127008
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"task_name": "GEM/wiki_lingua_en",
|
86 |
+
"prompt_name": "tldr_en",
|
87 |
+
"rougeLsum_precision": 0.27421900185303183,
|
88 |
+
"dataset_path": "GEM/wiki_lingua",
|
89 |
+
"dataset_name": "en",
|
90 |
+
"subset": null,
|
91 |
+
"rougeLsum_precision_stderr": 0.003293382529360642
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"task_name": "GEM/wiki_lingua_en",
|
95 |
+
"prompt_name": "tldr_en",
|
96 |
+
"rougeLsum_recall": 0.24426464874342324,
|
97 |
+
"dataset_path": "GEM/wiki_lingua",
|
98 |
+
"dataset_name": "en",
|
99 |
+
"subset": null,
|
100 |
+
"rougeLsum_recall_stderr": 0.002735802329870702
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"task_name": "GEM/wiki_lingua_en",
|
104 |
+
"prompt_name": "tldr_en",
|
105 |
+
"rougeLsum_fmeasure": 0.22300315725329103,
|
106 |
+
"dataset_path": "GEM/wiki_lingua",
|
107 |
+
"dataset_name": "en",
|
108 |
+
"subset": null,
|
109 |
+
"rougeLsum_fmeasure_stderr": 0.0020678687486032563
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"task_name": "GEM/wiki_lingua_en",
|
113 |
+
"prompt_name": "tldr_en",
|
114 |
+
"bleu": 4.071985313483549,
|
115 |
+
"dataset_path": "GEM/wiki_lingua",
|
116 |
+
"dataset_name": "en",
|
117 |
+
"subset": null,
|
118 |
+
"bleu_stderr": 0.07738970826466988
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"config": {
|
122 |
+
"model": "hf-causal",
|
123 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
|
124 |
+
"task_args": "",
|
125 |
+
"num_fewshot": 2,
|
126 |
+
"batch_size": 8,
|
127 |
+
"device": "cuda",
|
128 |
+
"use_cache": false,
|
129 |
+
"limit": 3000,
|
130 |
+
"bootstrap_iters": 10,
|
131 |
+
"seed": 1234
|
132 |
+
}
|
133 |
+
}
|
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": [
|
3 |
+
{
|
4 |
+
"task_name": "GEM/wiki_lingua_en",
|
5 |
+
"prompt_name": "tldr_en",
|
6 |
+
"rouge1_precision": 0.2571754504445173,
|
7 |
+
"dataset_path": "GEM/wiki_lingua",
|
8 |
+
"dataset_name": "en",
|
9 |
+
"subset": null,
|
10 |
+
"rouge1_precision_stderr": 0.0037874180373320154
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"task_name": "GEM/wiki_lingua_en",
|
14 |
+
"prompt_name": "tldr_en",
|
15 |
+
"rouge1_recall": 0.22046706474081984,
|
16 |
+
"dataset_path": "GEM/wiki_lingua",
|
17 |
+
"dataset_name": "en",
|
18 |
+
"subset": null,
|
19 |
+
"rouge1_recall_stderr": 0.0032483649489955437
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"task_name": "GEM/wiki_lingua_en",
|
23 |
+
"prompt_name": "tldr_en",
|
24 |
+
"rouge1_fmeasure": 0.20409132873325006,
|
25 |
+
"dataset_path": "GEM/wiki_lingua",
|
26 |
+
"dataset_name": "en",
|
27 |
+
"subset": null,
|
28 |
+
"rouge1_fmeasure_stderr": 0.0026059526618525918
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"task_name": "GEM/wiki_lingua_en",
|
32 |
+
"prompt_name": "tldr_en",
|
33 |
+
"rouge2_precision": 0.07716310931244513,
|
34 |
+
"dataset_path": "GEM/wiki_lingua",
|
35 |
+
"dataset_name": "en",
|
36 |
+
"subset": null,
|
37 |
+
"rouge2_precision_stderr": 0.0021423793043653955
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"task_name": "GEM/wiki_lingua_en",
|
41 |
+
"prompt_name": "tldr_en",
|
42 |
+
"rouge2_recall": 0.06255207307386669,
|
43 |
+
"dataset_path": "GEM/wiki_lingua",
|
44 |
+
"dataset_name": "en",
|
45 |
+
"subset": null,
|
46 |
+
"rouge2_recall_stderr": 0.001665836586847627
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"task_name": "GEM/wiki_lingua_en",
|
50 |
+
"prompt_name": "tldr_en",
|
51 |
+
"rouge2_fmeasure": 0.05803958336813696,
|
52 |
+
"dataset_path": "GEM/wiki_lingua",
|
53 |
+
"dataset_name": "en",
|
54 |
+
"subset": null,
|
55 |
+
"rouge2_fmeasure_stderr": 0.0013905246714186119
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"task_name": "GEM/wiki_lingua_en",
|
59 |
+
"prompt_name": "tldr_en",
|
60 |
+
"rougeL_precision": 0.19917050763282695,
|
61 |
+
"dataset_path": "GEM/wiki_lingua",
|
62 |
+
"dataset_name": "en",
|
63 |
+
"subset": null,
|
64 |
+
"rougeL_precision_stderr": 0.0031058368371087975
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"task_name": "GEM/wiki_lingua_en",
|
68 |
+
"prompt_name": "tldr_en",
|
69 |
+
"rougeL_recall": 0.16863998232504207,
|
70 |
+
"dataset_path": "GEM/wiki_lingua",
|
71 |
+
"dataset_name": "en",
|
72 |
+
"subset": null,
|
73 |
+
"rougeL_recall_stderr": 0.0025563859107073333
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"task_name": "GEM/wiki_lingua_en",
|
77 |
+
"prompt_name": "tldr_en",
|
78 |
+
"rougeL_fmeasure": 0.1557150280292743,
|
79 |
+
"dataset_path": "GEM/wiki_lingua",
|
80 |
+
"dataset_name": "en",
|
81 |
+
"subset": null,
|
82 |
+
"rougeL_fmeasure_stderr": 0.0020175919818759087
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"task_name": "GEM/wiki_lingua_en",
|
86 |
+
"prompt_name": "tldr_en",
|
87 |
+
"rougeLsum_precision": 0.24286851428671166,
|
88 |
+
"dataset_path": "GEM/wiki_lingua",
|
89 |
+
"dataset_name": "en",
|
90 |
+
"subset": null,
|
91 |
+
"rougeLsum_precision_stderr": 0.003627579238772214
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"task_name": "GEM/wiki_lingua_en",
|
95 |
+
"prompt_name": "tldr_en",
|
96 |
+
"rougeLsum_recall": 0.2070808578907387,
|
97 |
+
"dataset_path": "GEM/wiki_lingua",
|
98 |
+
"dataset_name": "en",
|
99 |
+
"subset": null,
|
100 |
+
"rougeLsum_recall_stderr": 0.003069864948055298
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"task_name": "GEM/wiki_lingua_en",
|
104 |
+
"prompt_name": "tldr_en",
|
105 |
+
"rougeLsum_fmeasure": 0.19198507442281348,
|
106 |
+
"dataset_path": "GEM/wiki_lingua",
|
107 |
+
"dataset_name": "en",
|
108 |
+
"subset": null,
|
109 |
+
"rougeLsum_fmeasure_stderr": 0.002473123474513908
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"task_name": "GEM/wiki_lingua_en",
|
113 |
+
"prompt_name": "tldr_en",
|
114 |
+
"bleu": 3.256877193820952,
|
115 |
+
"dataset_path": "GEM/wiki_lingua",
|
116 |
+
"dataset_name": "en",
|
117 |
+
"subset": null,
|
118 |
+
"bleu_stderr": 0.07385158775345343
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"config": {
|
122 |
+
"model": "hf-causal",
|
123 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
|
124 |
+
"task_args": "",
|
125 |
+
"num_fewshot": 3,
|
126 |
+
"batch_size": 8,
|
127 |
+
"device": "cuda",
|
128 |
+
"use_cache": false,
|
129 |
+
"limit": 3000,
|
130 |
+
"bootstrap_iters": 10,
|
131 |
+
"seed": 1234
|
132 |
+
}
|
133 |
+
}
|
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": [
|
3 |
+
{
|
4 |
+
"task_name": "GEM/wiki_lingua_en",
|
5 |
+
"prompt_name": "tldr_en",
|
6 |
+
"rouge1_precision": 0.08608412704276916,
|
7 |
+
"dataset_path": "GEM/wiki_lingua",
|
8 |
+
"dataset_name": "en",
|
9 |
+
"subset": null,
|
10 |
+
"rouge1_precision_stderr": 0.0031073010197710128
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"task_name": "GEM/wiki_lingua_en",
|
14 |
+
"prompt_name": "tldr_en",
|
15 |
+
"rouge1_recall": 0.07306085845324418,
|
16 |
+
"dataset_path": "GEM/wiki_lingua",
|
17 |
+
"dataset_name": "en",
|
18 |
+
"subset": null,
|
19 |
+
"rouge1_recall_stderr": 0.0026739004311174368
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"task_name": "GEM/wiki_lingua_en",
|
23 |
+
"prompt_name": "tldr_en",
|
24 |
+
"rouge1_fmeasure": 0.06754366655458334,
|
25 |
+
"dataset_path": "GEM/wiki_lingua",
|
26 |
+
"dataset_name": "en",
|
27 |
+
"subset": null,
|
28 |
+
"rouge1_fmeasure_stderr": 0.00231821642571142
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"task_name": "GEM/wiki_lingua_en",
|
32 |
+
"prompt_name": "tldr_en",
|
33 |
+
"rouge2_precision": 0.02477632495929693,
|
34 |
+
"dataset_path": "GEM/wiki_lingua",
|
35 |
+
"dataset_name": "en",
|
36 |
+
"subset": null,
|
37 |
+
"rouge2_precision_stderr": 0.0013952242546163309
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"task_name": "GEM/wiki_lingua_en",
|
41 |
+
"prompt_name": "tldr_en",
|
42 |
+
"rouge2_recall": 0.020963360092888926,
|
43 |
+
"dataset_path": "GEM/wiki_lingua",
|
44 |
+
"dataset_name": "en",
|
45 |
+
"subset": null,
|
46 |
+
"rouge2_recall_stderr": 0.0011677531292808372
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"task_name": "GEM/wiki_lingua_en",
|
50 |
+
"prompt_name": "tldr_en",
|
51 |
+
"rouge2_fmeasure": 0.01891360756998624,
|
52 |
+
"dataset_path": "GEM/wiki_lingua",
|
53 |
+
"dataset_name": "en",
|
54 |
+
"subset": null,
|
55 |
+
"rouge2_fmeasure_stderr": 0.0009547131510927643
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"task_name": "GEM/wiki_lingua_en",
|
59 |
+
"prompt_name": "tldr_en",
|
60 |
+
"rougeL_precision": 0.06705402166283986,
|
61 |
+
"dataset_path": "GEM/wiki_lingua",
|
62 |
+
"dataset_name": "en",
|
63 |
+
"subset": null,
|
64 |
+
"rougeL_precision_stderr": 0.0024891810743964266
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"task_name": "GEM/wiki_lingua_en",
|
68 |
+
"prompt_name": "tldr_en",
|
69 |
+
"rougeL_recall": 0.05633331544121819,
|
70 |
+
"dataset_path": "GEM/wiki_lingua",
|
71 |
+
"dataset_name": "en",
|
72 |
+
"subset": null,
|
73 |
+
"rougeL_recall_stderr": 0.0021123430958068772
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"task_name": "GEM/wiki_lingua_en",
|
77 |
+
"prompt_name": "tldr_en",
|
78 |
+
"rougeL_fmeasure": 0.05180637434259422,
|
79 |
+
"dataset_path": "GEM/wiki_lingua",
|
80 |
+
"dataset_name": "en",
|
81 |
+
"subset": null,
|
82 |
+
"rougeL_fmeasure_stderr": 0.0017937325092342497
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"task_name": "GEM/wiki_lingua_en",
|
86 |
+
"prompt_name": "tldr_en",
|
87 |
+
"rougeLsum_precision": 0.08098856968063572,
|
88 |
+
"dataset_path": "GEM/wiki_lingua",
|
89 |
+
"dataset_name": "en",
|
90 |
+
"subset": null,
|
91 |
+
"rougeLsum_precision_stderr": 0.002946288914451725
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"task_name": "GEM/wiki_lingua_en",
|
95 |
+
"prompt_name": "tldr_en",
|
96 |
+
"rougeLsum_recall": 0.06822240197436494,
|
97 |
+
"dataset_path": "GEM/wiki_lingua",
|
98 |
+
"dataset_name": "en",
|
99 |
+
"subset": null,
|
100 |
+
"rougeLsum_recall_stderr": 0.002506023867308553
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"task_name": "GEM/wiki_lingua_en",
|
104 |
+
"prompt_name": "tldr_en",
|
105 |
+
"rougeLsum_fmeasure": 0.06316340092042942,
|
106 |
+
"dataset_path": "GEM/wiki_lingua",
|
107 |
+
"dataset_name": "en",
|
108 |
+
"subset": null,
|
109 |
+
"rougeLsum_fmeasure_stderr": 0.0021760431723604587
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"task_name": "GEM/wiki_lingua_en",
|
113 |
+
"prompt_name": "tldr_en",
|
114 |
+
"bleu": 0.11803096947131637,
|
115 |
+
"dataset_path": "GEM/wiki_lingua",
|
116 |
+
"dataset_name": "en",
|
117 |
+
"subset": null,
|
118 |
+
"bleu_stderr": 0.018616113127493047
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"config": {
|
122 |
+
"model": "hf-causal",
|
123 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
|
124 |
+
"task_args": "",
|
125 |
+
"num_fewshot": 4,
|
126 |
+
"batch_size": 8,
|
127 |
+
"device": "cuda",
|
128 |
+
"use_cache": false,
|
129 |
+
"limit": 3000,
|
130 |
+
"bootstrap_iters": 10,
|
131 |
+
"seed": 1234
|
132 |
+
}
|
133 |
+
}
|
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": [
|
3 |
+
{
|
4 |
+
"task_name": "GEM/wiki_lingua_en",
|
5 |
+
"prompt_name": "tldr_en",
|
6 |
+
"rouge1_precision": 0.014929032882940911,
|
7 |
+
"dataset_path": "GEM/wiki_lingua",
|
8 |
+
"dataset_name": "en",
|
9 |
+
"subset": null,
|
10 |
+
"rouge1_precision_stderr": 0.001437252981426829
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"task_name": "GEM/wiki_lingua_en",
|
14 |
+
"prompt_name": "tldr_en",
|
15 |
+
"rouge1_recall": 0.012605454696432882,
|
16 |
+
"dataset_path": "GEM/wiki_lingua",
|
17 |
+
"dataset_name": "en",
|
18 |
+
"subset": null,
|
19 |
+
"rouge1_recall_stderr": 0.0012515184538477262
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"task_name": "GEM/wiki_lingua_en",
|
23 |
+
"prompt_name": "tldr_en",
|
24 |
+
"rouge1_fmeasure": 0.01167038909001005,
|
25 |
+
"dataset_path": "GEM/wiki_lingua",
|
26 |
+
"dataset_name": "en",
|
27 |
+
"subset": null,
|
28 |
+
"rouge1_fmeasure_stderr": 0.0010944129890059014
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"task_name": "GEM/wiki_lingua_en",
|
32 |
+
"prompt_name": "tldr_en",
|
33 |
+
"rouge2_precision": 0.0041010832248453595,
|
34 |
+
"dataset_path": "GEM/wiki_lingua",
|
35 |
+
"dataset_name": "en",
|
36 |
+
"subset": null,
|
37 |
+
"rouge2_precision_stderr": 0.0005815077800226312
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"task_name": "GEM/wiki_lingua_en",
|
41 |
+
"prompt_name": "tldr_en",
|
42 |
+
"rouge2_recall": 0.0034469110258534192,
|
43 |
+
"dataset_path": "GEM/wiki_lingua",
|
44 |
+
"dataset_name": "en",
|
45 |
+
"subset": null,
|
46 |
+
"rouge2_recall_stderr": 0.0004725161504235943
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"task_name": "GEM/wiki_lingua_en",
|
50 |
+
"prompt_name": "tldr_en",
|
51 |
+
"rouge2_fmeasure": 0.003203820995720566,
|
52 |
+
"dataset_path": "GEM/wiki_lingua",
|
53 |
+
"dataset_name": "en",
|
54 |
+
"subset": null,
|
55 |
+
"rouge2_fmeasure_stderr": 0.00042103771194584703
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"task_name": "GEM/wiki_lingua_en",
|
59 |
+
"prompt_name": "tldr_en",
|
60 |
+
"rougeL_precision": 0.011742235186401492,
|
61 |
+
"dataset_path": "GEM/wiki_lingua",
|
62 |
+
"dataset_name": "en",
|
63 |
+
"subset": null,
|
64 |
+
"rougeL_precision_stderr": 0.0011661092630625284
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"task_name": "GEM/wiki_lingua_en",
|
68 |
+
"prompt_name": "tldr_en",
|
69 |
+
"rougeL_recall": 0.009919184883424567,
|
70 |
+
"dataset_path": "GEM/wiki_lingua",
|
71 |
+
"dataset_name": "en",
|
72 |
+
"subset": null,
|
73 |
+
"rougeL_recall_stderr": 0.0010141555758155579
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"task_name": "GEM/wiki_lingua_en",
|
77 |
+
"prompt_name": "tldr_en",
|
78 |
+
"rougeL_fmeasure": 0.009059396220908112,
|
79 |
+
"dataset_path": "GEM/wiki_lingua",
|
80 |
+
"dataset_name": "en",
|
81 |
+
"subset": null,
|
82 |
+
"rougeL_fmeasure_stderr": 0.0008608509358819988
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"task_name": "GEM/wiki_lingua_en",
|
86 |
+
"prompt_name": "tldr_en",
|
87 |
+
"rougeLsum_precision": 0.014255925532109718,
|
88 |
+
"dataset_path": "GEM/wiki_lingua",
|
89 |
+
"dataset_name": "en",
|
90 |
+
"subset": null,
|
91 |
+
"rougeLsum_precision_stderr": 0.001380098431266019
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"task_name": "GEM/wiki_lingua_en",
|
95 |
+
"prompt_name": "tldr_en",
|
96 |
+
"rougeLsum_recall": 0.012021925711305772,
|
97 |
+
"dataset_path": "GEM/wiki_lingua",
|
98 |
+
"dataset_name": "en",
|
99 |
+
"subset": null,
|
100 |
+
"rougeLsum_recall_stderr": 0.001202720217240228
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"task_name": "GEM/wiki_lingua_en",
|
104 |
+
"prompt_name": "tldr_en",
|
105 |
+
"rougeLsum_fmeasure": 0.011096298465596519,
|
106 |
+
"dataset_path": "GEM/wiki_lingua",
|
107 |
+
"dataset_name": "en",
|
108 |
+
"subset": null,
|
109 |
+
"rougeLsum_fmeasure_stderr": 0.0010440702662319144
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"task_name": "GEM/wiki_lingua_en",
|
113 |
+
"prompt_name": "tldr_en",
|
114 |
+
"bleu": 5.860100637257344e-12,
|
115 |
+
"dataset_path": "GEM/wiki_lingua",
|
116 |
+
"dataset_name": "en",
|
117 |
+
"subset": null,
|
118 |
+
"bleu_stderr": 1.0205550894986597e-10
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"config": {
|
122 |
+
"model": "hf-causal",
|
123 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
|
124 |
+
"task_args": "",
|
125 |
+
"num_fewshot": 5,
|
126 |
+
"batch_size": 8,
|
127 |
+
"device": "cuda",
|
128 |
+
"use_cache": false,
|
129 |
+
"limit": 3000,
|
130 |
+
"bootstrap_iters": 10,
|
131 |
+
"seed": 1234
|
132 |
+
}
|
133 |
+
}
|
8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_3.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": [
|
3 |
+
{
|
4 |
+
"task_name": "gem_xsum",
|
5 |
+
"prompt_name": "article_DOC_summary",
|
6 |
+
"rouge1_precision": 0.21592444335846092,
|
7 |
+
"dataset_path": "GEM/xsum",
|
8 |
+
"dataset_name": null,
|
9 |
+
"subset": "",
|
10 |
+
"rouge1_precision_stderr": 0.004246983520225218
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"task_name": "gem_xsum",
|
14 |
+
"prompt_name": "article_DOC_summary",
|
15 |
+
"rouge1_recall": 0.27255737663479357,
|
16 |
+
"dataset_path": "GEM/xsum",
|
17 |
+
"dataset_name": null,
|
18 |
+
"subset": "",
|
19 |
+
"rouge1_recall_stderr": 0.00447437166346409
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"task_name": "gem_xsum",
|
23 |
+
"prompt_name": "article_DOC_summary",
|
24 |
+
"rouge1_fmeasure": 0.22044943741927525,
|
25 |
+
"dataset_path": "GEM/xsum",
|
26 |
+
"dataset_name": null,
|
27 |
+
"subset": "",
|
28 |
+
"rouge1_fmeasure_stderr": 0.003500198780987625
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"task_name": "gem_xsum",
|
32 |
+
"prompt_name": "article_DOC_summary",
|
33 |
+
"rouge2_precision": 0.05330214126583765,
|
34 |
+
"dataset_path": "GEM/xsum",
|
35 |
+
"dataset_name": null,
|
36 |
+
"subset": "",
|
37 |
+
"rouge2_precision_stderr": 0.0025012370800294183
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"task_name": "gem_xsum",
|
41 |
+
"prompt_name": "article_DOC_summary",
|
42 |
+
"rouge2_recall": 0.06515904722605939,
|
43 |
+
"dataset_path": "GEM/xsum",
|
44 |
+
"dataset_name": null,
|
45 |
+
"subset": "",
|
46 |
+
"rouge2_recall_stderr": 0.0025535061286592504
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"task_name": "gem_xsum",
|
50 |
+
"prompt_name": "article_DOC_summary",
|
51 |
+
"rouge2_fmeasure": 0.05351615652078515,
|
52 |
+
"dataset_path": "GEM/xsum",
|
53 |
+
"dataset_name": null,
|
54 |
+
"subset": "",
|
55 |
+
"rouge2_fmeasure_stderr": 0.0022197484992897607
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"task_name": "gem_xsum",
|
59 |
+
"prompt_name": "article_DOC_summary",
|
60 |
+
"rougeL_precision": 0.16172792138960806,
|
61 |
+
"dataset_path": "GEM/xsum",
|
62 |
+
"dataset_name": null,
|
63 |
+
"subset": "",
|
64 |
+
"rougeL_precision_stderr": 0.003474612361597086
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"task_name": "gem_xsum",
|
68 |
+
"prompt_name": "article_DOC_summary",
|
69 |
+
"rougeL_recall": 0.20248110828398325,
|
70 |
+
"dataset_path": "GEM/xsum",
|
71 |
+
"dataset_name": null,
|
72 |
+
"subset": "",
|
73 |
+
"rougeL_recall_stderr": 0.0034801799909718704
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"task_name": "gem_xsum",
|
77 |
+
"prompt_name": "article_DOC_summary",
|
78 |
+
"rougeL_fmeasure": 0.16421475353571632,
|
79 |
+
"dataset_path": "GEM/xsum",
|
80 |
+
"dataset_name": null,
|
81 |
+
"subset": "",
|
82 |
+
"rougeL_fmeasure_stderr": 0.0028136325952485735
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"task_name": "gem_xsum",
|
86 |
+
"prompt_name": "article_DOC_summary",
|
87 |
+
"rougeLsum_precision": 0.16634670992387487,
|
88 |
+
"dataset_path": "GEM/xsum",
|
89 |
+
"dataset_name": null,
|
90 |
+
"subset": "",
|
91 |
+
"rougeLsum_precision_stderr": 0.0034675307885691244
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"task_name": "gem_xsum",
|
95 |
+
"prompt_name": "article_DOC_summary",
|
96 |
+
"rougeLsum_recall": 0.21304752670955557,
|
97 |
+
"dataset_path": "GEM/xsum",
|
98 |
+
"dataset_name": null,
|
99 |
+
"subset": "",
|
100 |
+
"rougeLsum_recall_stderr": 0.003802772338811888
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"task_name": "gem_xsum",
|
104 |
+
"prompt_name": "article_DOC_summary",
|
105 |
+
"rougeLsum_fmeasure": 0.17052032174118803,
|
106 |
+
"dataset_path": "GEM/xsum",
|
107 |
+
"dataset_name": null,
|
108 |
+
"subset": "",
|
109 |
+
"rougeLsum_fmeasure_stderr": 0.002883605135295133
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"task_name": "gem_xsum",
|
113 |
+
"prompt_name": "article_DOC_summary",
|
114 |
+
"bleu": 2.4283658434676942,
|
115 |
+
"dataset_path": "GEM/xsum",
|
116 |
+
"dataset_name": null,
|
117 |
+
"subset": "",
|
118 |
+
"bleu_stderr": 0.1084231496409544
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"config": {
|
122 |
+
"model": "hf-causal",
|
123 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
|
124 |
+
"task_args": "",
|
125 |
+
"num_fewshot": 3,
|
126 |
+
"batch_size": 8,
|
127 |
+
"device": "cuda",
|
128 |
+
"use_cache": false,
|
129 |
+
"limit": 3000,
|
130 |
+
"bootstrap_iters": 10,
|
131 |
+
"seed": 1234
|
132 |
+
}
|
133 |
+
}
|
8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_4.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": [
|
3 |
+
{
|
4 |
+
"task_name": "gem_xsum",
|
5 |
+
"prompt_name": "article_DOC_summary",
|
6 |
+
"rouge1_precision": 0.06147088600466444,
|
7 |
+
"dataset_path": "GEM/xsum",
|
8 |
+
"dataset_name": null,
|
9 |
+
"subset": "",
|
10 |
+
"rouge1_precision_stderr": 0.003807688121301648
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"task_name": "gem_xsum",
|
14 |
+
"prompt_name": "article_DOC_summary",
|
15 |
+
"rouge1_recall": 0.06919835565602203,
|
16 |
+
"dataset_path": "GEM/xsum",
|
17 |
+
"dataset_name": null,
|
18 |
+
"subset": "",
|
19 |
+
"rouge1_recall_stderr": 0.004080878740872531
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"task_name": "gem_xsum",
|
23 |
+
"prompt_name": "article_DOC_summary",
|
24 |
+
"rouge1_fmeasure": 0.05798370899016291,
|
25 |
+
"dataset_path": "GEM/xsum",
|
26 |
+
"dataset_name": null,
|
27 |
+
"subset": "",
|
28 |
+
"rouge1_fmeasure_stderr": 0.003309673469340258
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"task_name": "gem_xsum",
|
32 |
+
"prompt_name": "article_DOC_summary",
|
33 |
+
"rouge2_precision": 0.014228533929974868,
|
34 |
+
"dataset_path": "GEM/xsum",
|
35 |
+
"dataset_name": null,
|
36 |
+
"subset": "",
|
37 |
+
"rouge2_precision_stderr": 0.0014047860954449383
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"task_name": "gem_xsum",
|
41 |
+
"prompt_name": "article_DOC_summary",
|
42 |
+
"rouge2_recall": 0.0166314025722418,
|
43 |
+
"dataset_path": "GEM/xsum",
|
44 |
+
"dataset_name": null,
|
45 |
+
"subset": "",
|
46 |
+
"rouge2_recall_stderr": 0.0014602317545186565
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"task_name": "gem_xsum",
|
50 |
+
"prompt_name": "article_DOC_summary",
|
51 |
+
"rouge2_fmeasure": 0.013641962174467587,
|
52 |
+
"dataset_path": "GEM/xsum",
|
53 |
+
"dataset_name": null,
|
54 |
+
"subset": "",
|
55 |
+
"rouge2_fmeasure_stderr": 0.0012106481387976049
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"task_name": "gem_xsum",
|
59 |
+
"prompt_name": "article_DOC_summary",
|
60 |
+
"rougeL_precision": 0.04649724897774529,
|
61 |
+
"dataset_path": "GEM/xsum",
|
62 |
+
"dataset_name": null,
|
63 |
+
"subset": "",
|
64 |
+
"rougeL_precision_stderr": 0.0029768758298026627
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"task_name": "gem_xsum",
|
68 |
+
"prompt_name": "article_DOC_summary",
|
69 |
+
"rougeL_recall": 0.05168291535676844,
|
70 |
+
"dataset_path": "GEM/xsum",
|
71 |
+
"dataset_name": null,
|
72 |
+
"subset": "",
|
73 |
+
"rougeL_recall_stderr": 0.003094715210052394
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"task_name": "gem_xsum",
|
77 |
+
"prompt_name": "article_DOC_summary",
|
78 |
+
"rougeL_fmeasure": 0.043392440237132235,
|
79 |
+
"dataset_path": "GEM/xsum",
|
80 |
+
"dataset_name": null,
|
81 |
+
"subset": "",
|
82 |
+
"rougeL_fmeasure_stderr": 0.002526920635642375
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"task_name": "gem_xsum",
|
86 |
+
"prompt_name": "article_DOC_summary",
|
87 |
+
"rougeLsum_precision": 0.048035895047127686,
|
88 |
+
"dataset_path": "GEM/xsum",
|
89 |
+
"dataset_name": null,
|
90 |
+
"subset": "",
|
91 |
+
"rougeLsum_precision_stderr": 0.003028118969042768
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"task_name": "gem_xsum",
|
95 |
+
"prompt_name": "article_DOC_summary",
|
96 |
+
"rougeLsum_recall": 0.05464177271239397,
|
97 |
+
"dataset_path": "GEM/xsum",
|
98 |
+
"dataset_name": null,
|
99 |
+
"subset": "",
|
100 |
+
"rougeLsum_recall_stderr": 0.0033104672738008553
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"task_name": "gem_xsum",
|
104 |
+
"prompt_name": "article_DOC_summary",
|
105 |
+
"rougeLsum_fmeasure": 0.04537985187674532,
|
106 |
+
"dataset_path": "GEM/xsum",
|
107 |
+
"dataset_name": null,
|
108 |
+
"subset": "",
|
109 |
+
"rougeLsum_fmeasure_stderr": 0.0026361408489140024
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"task_name": "gem_xsum",
|
113 |
+
"prompt_name": "article_DOC_summary",
|
114 |
+
"bleu": 0.40228156399700404,
|
115 |
+
"dataset_path": "GEM/xsum",
|
116 |
+
"dataset_name": null,
|
117 |
+
"subset": "",
|
118 |
+
"bleu_stderr": 0.0795066489605694
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"config": {
|
122 |
+
"model": "hf-causal",
|
123 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
|
124 |
+
"task_args": "",
|
125 |
+
"num_fewshot": 4,
|
126 |
+
"batch_size": 8,
|
127 |
+
"device": "cuda",
|
128 |
+
"use_cache": false,
|
129 |
+
"limit": 3000,
|
130 |
+
"bootstrap_iters": 10,
|
131 |
+
"seed": 1234
|
132 |
+
}
|
133 |
+
}
|
8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_5.json
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": [
|
3 |
+
{
|
4 |
+
"task_name": "gem_xsum",
|
5 |
+
"prompt_name": "article_DOC_summary",
|
6 |
+
"rouge1_precision": 0.002305114359249611,
|
7 |
+
"dataset_path": "GEM/xsum",
|
8 |
+
"dataset_name": null,
|
9 |
+
"subset": "",
|
10 |
+
"rouge1_precision_stderr": 0.0006855365150103617
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"task_name": "gem_xsum",
|
14 |
+
"prompt_name": "article_DOC_summary",
|
15 |
+
"rouge1_recall": 0.002688750909485673,
|
16 |
+
"dataset_path": "GEM/xsum",
|
17 |
+
"dataset_name": null,
|
18 |
+
"subset": "",
|
19 |
+
"rouge1_recall_stderr": 0.0008419301936781596
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"task_name": "gem_xsum",
|
23 |
+
"prompt_name": "article_DOC_summary",
|
24 |
+
"rouge1_fmeasure": 0.0022410464211047414,
|
25 |
+
"dataset_path": "GEM/xsum",
|
26 |
+
"dataset_name": null,
|
27 |
+
"subset": "",
|
28 |
+
"rouge1_fmeasure_stderr": 0.0006379815102748624
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"task_name": "gem_xsum",
|
32 |
+
"prompt_name": "article_DOC_summary",
|
33 |
+
"rouge2_precision": 0.00023662681974673803,
|
34 |
+
"dataset_path": "GEM/xsum",
|
35 |
+
"dataset_name": null,
|
36 |
+
"subset": "",
|
37 |
+
"rouge2_precision_stderr": 0.00010805221445763836
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"task_name": "gem_xsum",
|
41 |
+
"prompt_name": "article_DOC_summary",
|
42 |
+
"rouge2_recall": 0.0003191847290509596,
|
43 |
+
"dataset_path": "GEM/xsum",
|
44 |
+
"dataset_name": null,
|
45 |
+
"subset": "",
|
46 |
+
"rouge2_recall_stderr": 0.00013624996945101362
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"task_name": "gem_xsum",
|
50 |
+
"prompt_name": "article_DOC_summary",
|
51 |
+
"rouge2_fmeasure": 0.0002428157678792438,
|
52 |
+
"dataset_path": "GEM/xsum",
|
53 |
+
"dataset_name": null,
|
54 |
+
"subset": "",
|
55 |
+
"rouge2_fmeasure_stderr": 0.00010121603253679161
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"task_name": "gem_xsum",
|
59 |
+
"prompt_name": "article_DOC_summary",
|
60 |
+
"rougeL_precision": 0.0014967981731443785,
|
61 |
+
"dataset_path": "GEM/xsum",
|
62 |
+
"dataset_name": null,
|
63 |
+
"subset": "",
|
64 |
+
"rougeL_precision_stderr": 0.00044282917196946015
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"task_name": "gem_xsum",
|
68 |
+
"prompt_name": "article_DOC_summary",
|
69 |
+
"rougeL_recall": 0.0018089244853048499,
|
70 |
+
"dataset_path": "GEM/xsum",
|
71 |
+
"dataset_name": null,
|
72 |
+
"subset": "",
|
73 |
+
"rougeL_recall_stderr": 0.0005955143349649119
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"task_name": "gem_xsum",
|
77 |
+
"prompt_name": "article_DOC_summary",
|
78 |
+
"rougeL_fmeasure": 0.001480383954453164,
|
79 |
+
"dataset_path": "GEM/xsum",
|
80 |
+
"dataset_name": null,
|
81 |
+
"subset": "",
|
82 |
+
"rougeL_fmeasure_stderr": 0.0004335775279500401
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"task_name": "gem_xsum",
|
86 |
+
"prompt_name": "article_DOC_summary",
|
87 |
+
"rougeLsum_precision": 0.0014611359517881007,
|
88 |
+
"dataset_path": "GEM/xsum",
|
89 |
+
"dataset_name": null,
|
90 |
+
"subset": "",
|
91 |
+
"rougeLsum_precision_stderr": 0.0004268628859757995
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"task_name": "gem_xsum",
|
95 |
+
"prompt_name": "article_DOC_summary",
|
96 |
+
"rougeLsum_recall": 0.0017987055420995693,
|
97 |
+
"dataset_path": "GEM/xsum",
|
98 |
+
"dataset_name": null,
|
99 |
+
"subset": "",
|
100 |
+
"rougeLsum_recall_stderr": 0.0005977174922462606
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"task_name": "gem_xsum",
|
104 |
+
"prompt_name": "article_DOC_summary",
|
105 |
+
"rougeLsum_fmeasure": 0.0014592568247182058,
|
106 |
+
"dataset_path": "GEM/xsum",
|
107 |
+
"dataset_name": null,
|
108 |
+
"subset": "",
|
109 |
+
"rougeLsum_fmeasure_stderr": 0.00042833093506174205
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"task_name": "gem_xsum",
|
113 |
+
"prompt_name": "article_DOC_summary",
|
114 |
+
"bleu": 1.2194816597556972e-29,
|
115 |
+
"dataset_path": "GEM/xsum",
|
116 |
+
"dataset_name": null,
|
117 |
+
"subset": "",
|
118 |
+
"bleu_stderr": 1.9673150803997765e-21
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"config": {
|
122 |
+
"model": "hf-causal",
|
123 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
|
124 |
+
"task_args": "",
|
125 |
+
"num_fewshot": 5,
|
126 |
+
"batch_size": 8,
|
127 |
+
"device": "cuda",
|
128 |
+
"use_cache": false,
|
129 |
+
"limit": 3000,
|
130 |
+
"bootstrap_iters": 10,
|
131 |
+
"seed": 1234
|
132 |
+
}
|
133 |
+
}
|
8b7178b13b/evaluation/rankeval/8b7178b13b_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.354,0.015129868238451773,0
|
3 |
+
anli_r2,acc,0.331,0.01488827258820394,0
|
4 |
+
anli_r3,acc,0.3458333333333333,0.01373624534231101,0
|
5 |
+
arc_challenge,acc,0.2721843003412969,0.013006600406423707,0
|
6 |
+
arc_challenge,acc_norm,0.2832764505119454,0.013167478735134575,0
|
7 |
+
arc_easy,acc,0.5707070707070707,0.010156678075911087,0
|
8 |
+
arc_easy,acc_norm,0.5172558922558923,0.010253671674754631,0
|
9 |
+
boolq,acc,0.5501529051987768,0.008700950643028801,1
|
10 |
+
cb,acc,0.2857142857142857,0.06091449038731724,1
|
11 |
+
cb,f1,0.30952380952380953,,1
|
12 |
+
copa,acc,0.7,0.046056618647183814,0
|
13 |
+
hellaswag,acc,0.4360685122485561,0.004948824501355485,0
|
14 |
+
hellaswag,acc_norm,0.5632344154550887,0.004949716368890496,0
|
15 |
+
piqa,acc,0.7225244831338411,0.010446818281039959,0
|
16 |
+
piqa,acc_norm,0.7317736670293797,0.010336761992404485,0
|
17 |
+
rte,acc,0.5306859205776173,0.03003973059219781,0
|
18 |
+
sciq,acc,0.848,0.011358918303475282,0
|
19 |
+
sciq,acc_norm,0.758,0.013550631705555958,0
|
20 |
+
storycloze_2016,acc,0.6969535008017104,0.010627613073376715,0
|
21 |
+
winogrande,acc,0.5666929755327546,0.013926915052757347,0
|
8b7178b13b/evaluation/rankeval/8b7178b13b_0_lm-eval_global_step84877_2023-05-15-10-06-37_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.354,
|
5 |
-
"acc_stderr": 0.015129868238451773
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.331,
|
9 |
-
"acc_stderr": 0.01488827258820394
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3458333333333333,
|
13 |
-
"acc_stderr": 0.01373624534231101
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.2857142857142857,
|
17 |
-
"acc_stderr": 0.06091449038731724,
|
18 |
-
"f1": 0.30952380952380953
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.7,
|
22 |
-
"acc_stderr": 0.046056618647183814
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4360685122485561,
|
26 |
-
"acc_stderr": 0.004948824501355485,
|
27 |
-
"acc_norm": 0.5632344154550887,
|
28 |
-
"acc_norm_stderr": 0.004949716368890496
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5306859205776173,
|
32 |
-
"acc_stderr": 0.03003973059219781
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5666929755327546,
|
36 |
-
"acc_stderr": 0.013926915052757347
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6969535008017104,
|
40 |
-
"acc_stderr": 0.010627613073376715
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5501529051987768,
|
44 |
-
"acc_stderr": 0.008700950643028801
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5707070707070707,
|
48 |
-
"acc_stderr": 0.010156678075911087,
|
49 |
-
"acc_norm": 0.5172558922558923,
|
50 |
-
"acc_norm_stderr": 0.010253671674754631
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2721843003412969,
|
54 |
-
"acc_stderr": 0.013006600406423707,
|
55 |
-
"acc_norm": 0.2832764505119454,
|
56 |
-
"acc_norm_stderr": 0.013167478735134575
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.848,
|
60 |
-
"acc_stderr": 0.011358918303475282,
|
61 |
-
"acc_norm": 0.758,
|
62 |
-
"acc_norm_stderr": 0.013550631705555958
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7225244831338411,
|
66 |
-
"acc_stderr": 0.010446818281039959,
|
67 |
-
"acc_norm": 0.7317736670293797,
|
68 |
-
"acc_norm_stderr": 0.010336761992404485
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b13b/evaluation/rankeval/8b7178b13b_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.34,0.014987482264363937,0
|
3 |
+
anli_r2,acc,0.326,0.014830507204541028,0
|
4 |
+
anli_r3,acc,0.3541666666666667,0.01381193349957096,0
|
5 |
+
arc_challenge,acc,0.27474402730375425,0.013044617212771227,0
|
6 |
+
arc_challenge,acc_norm,0.3037542662116041,0.01343890918477876,0
|
7 |
+
arc_easy,acc,0.5968013468013468,0.010065668576794803,0
|
8 |
+
arc_easy,acc_norm,0.5913299663299664,0.01008717449876288,0
|
9 |
+
boolq,acc,0.5562691131498471,0.008689501105367413,1
|
10 |
+
cb,acc,0.42857142857142855,0.06672848092813058,1
|
11 |
+
cb,f1,0.36324786324786323,,1
|
12 |
+
copa,acc,0.75,0.04351941398892446,0
|
13 |
+
hellaswag,acc,0.4411471818362876,0.004955095096264714,0
|
14 |
+
hellaswag,acc_norm,0.5774746066520613,0.004929517011508216,0
|
15 |
+
piqa,acc,0.7295973884657236,0.010363167031620784,0
|
16 |
+
piqa,acc_norm,0.7334058759521219,0.010316749863541365,0
|
17 |
+
rte,acc,0.5234657039711191,0.030063300411902652,0
|
18 |
+
sciq,acc,0.887,0.010016552866696846,0
|
19 |
+
sciq,acc_norm,0.882,0.01020686926438179,0
|
20 |
+
storycloze_2016,acc,0.6830571886691609,0.010759650951452121,0
|
21 |
+
winogrande,acc,0.5595895816890292,0.013952330311915603,0
|
8b7178b13b/evaluation/rankeval/8b7178b13b_1_lm-eval_global_step84877_2023-05-15-10-06-37_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.34,
|
5 |
-
"acc_stderr": 0.014987482264363937
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.326,
|
9 |
-
"acc_stderr": 0.014830507204541028
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3541666666666667,
|
13 |
-
"acc_stderr": 0.01381193349957096
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.42857142857142855,
|
17 |
-
"acc_stderr": 0.06672848092813058,
|
18 |
-
"f1": 0.36324786324786323
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.75,
|
22 |
-
"acc_stderr": 0.04351941398892446
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4411471818362876,
|
26 |
-
"acc_stderr": 0.004955095096264714,
|
27 |
-
"acc_norm": 0.5774746066520613,
|
28 |
-
"acc_norm_stderr": 0.004929517011508216
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5234657039711191,
|
32 |
-
"acc_stderr": 0.030063300411902652
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5595895816890292,
|
36 |
-
"acc_stderr": 0.013952330311915603
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6830571886691609,
|
40 |
-
"acc_stderr": 0.010759650951452121
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5562691131498471,
|
44 |
-
"acc_stderr": 0.008689501105367413
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5968013468013468,
|
48 |
-
"acc_stderr": 0.010065668576794803,
|
49 |
-
"acc_norm": 0.5913299663299664,
|
50 |
-
"acc_norm_stderr": 0.01008717449876288
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.27474402730375425,
|
54 |
-
"acc_stderr": 0.013044617212771227,
|
55 |
-
"acc_norm": 0.3037542662116041,
|
56 |
-
"acc_norm_stderr": 0.01343890918477876
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.887,
|
60 |
-
"acc_stderr": 0.010016552866696846,
|
61 |
-
"acc_norm": 0.882,
|
62 |
-
"acc_norm_stderr": 0.01020686926438179
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7295973884657236,
|
66 |
-
"acc_stderr": 0.010363167031620784,
|
67 |
-
"acc_norm": 0.7334058759521219,
|
68 |
-
"acc_norm_stderr": 0.010316749863541365
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b13b/evaluation/rankeval/8b7178b13b_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.343,0.015019206922356951,0
|
3 |
+
anli_r2,acc,0.339,0.014976758771620349,0
|
4 |
+
anli_r3,acc,0.33416666666666667,0.013622434813136774,0
|
5 |
+
arc_challenge,acc,0.28924914675767915,0.013250012579393443,0
|
6 |
+
arc_challenge,acc_norm,0.310580204778157,0.013522292098053059,0
|
7 |
+
arc_easy,acc,0.6077441077441077,0.010018744689650043,0
|
8 |
+
arc_easy,acc_norm,0.6026936026936027,0.010041053078884286,0
|
9 |
+
boolq,acc,0.5529051987767584,0.008695963064172717,1
|
10 |
+
cb,acc,0.4107142857142857,0.0663363415035954,1
|
11 |
+
cb,f1,0.30617283950617286,,1
|
12 |
+
copa,acc,0.75,0.04351941398892446,0
|
13 |
+
hellaswag,acc,0.4419438358892651,0.004956030970911519,0
|
14 |
+
hellaswag,acc_norm,0.5717984465245967,0.004938068627349502,0
|
15 |
+
piqa,acc,0.7295973884657236,0.010363167031620784,0
|
16 |
+
piqa,acc_norm,0.735038084874864,0.010296557993316042,0
|
17 |
+
rte,acc,0.4404332129963899,0.029882123363118726,0
|
18 |
+
sciq,acc,0.914,0.008870325962594766,0
|
19 |
+
sciq,acc_norm,0.908,0.009144376393151108,0
|
20 |
+
storycloze_2016,acc,0.6862640299305185,0.01073017911931762,0
|
21 |
+
winogrande,acc,0.5382794001578532,0.014011242594964115,0
|
8b7178b13b/evaluation/rankeval/8b7178b13b_2_lm-eval_global_step84877_2023-05-15-10-06-37_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.343,
|
5 |
-
"acc_stderr": 0.015019206922356951
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.339,
|
9 |
-
"acc_stderr": 0.014976758771620349
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.33416666666666667,
|
13 |
-
"acc_stderr": 0.013622434813136774
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.4107142857142857,
|
17 |
-
"acc_stderr": 0.0663363415035954,
|
18 |
-
"f1": 0.30617283950617286
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.75,
|
22 |
-
"acc_stderr": 0.04351941398892446
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4419438358892651,
|
26 |
-
"acc_stderr": 0.004956030970911519,
|
27 |
-
"acc_norm": 0.5717984465245967,
|
28 |
-
"acc_norm_stderr": 0.004938068627349502
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.4404332129963899,
|
32 |
-
"acc_stderr": 0.029882123363118726
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5382794001578532,
|
36 |
-
"acc_stderr": 0.014011242594964115
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6862640299305185,
|
40 |
-
"acc_stderr": 0.01073017911931762
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5529051987767584,
|
44 |
-
"acc_stderr": 0.008695963064172717
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6077441077441077,
|
48 |
-
"acc_stderr": 0.010018744689650043,
|
49 |
-
"acc_norm": 0.6026936026936027,
|
50 |
-
"acc_norm_stderr": 0.010041053078884286
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.28924914675767915,
|
54 |
-
"acc_stderr": 0.013250012579393443,
|
55 |
-
"acc_norm": 0.310580204778157,
|
56 |
-
"acc_norm_stderr": 0.013522292098053059
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.914,
|
60 |
-
"acc_stderr": 0.008870325962594766,
|
61 |
-
"acc_norm": 0.908,
|
62 |
-
"acc_norm_stderr": 0.009144376393151108
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7295973884657236,
|
66 |
-
"acc_stderr": 0.010363167031620784,
|
67 |
-
"acc_norm": 0.735038084874864,
|
68 |
-
"acc_norm_stderr": 0.010296557993316042
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b13b/evaluation/rankeval/8b7178b13b_3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.344,0.015029633724408945,0
|
3 |
+
anli_r2,acc,0.365,0.0152317762262649,0
|
4 |
+
anli_r3,acc,0.3333333333333333,0.013613950010225612,0
|
5 |
+
arc_challenge,acc,0.2858361774744027,0.013203196088537369,0
|
6 |
+
arc_challenge,acc_norm,0.3037542662116041,0.01343890918477876,0
|
7 |
+
arc_easy,acc,0.6077441077441077,0.010018744689650043,0
|
8 |
+
arc_easy,acc_norm,0.6022727272727273,0.010042861602178056,0
|
9 |
+
boolq,acc,0.5314984709480123,0.00872768484861531,1
|
10 |
+
cb,acc,0.44642857142857145,0.06703189227942398,1
|
11 |
+
cb,f1,0.428030303030303,,1
|
12 |
+
copa,acc,0.74,0.04408440022768079,0
|
13 |
+
hellaswag,acc,0.4431388169687313,0.004957410545559414,0
|
14 |
+
hellaswag,acc_norm,0.58105954989046,0.004923772581848488,0
|
15 |
+
piqa,acc,0.7323177366702938,0.010330111189370429,0
|
16 |
+
piqa,acc_norm,0.735038084874864,0.010296557993316044,0
|
17 |
+
rte,acc,0.48736462093862815,0.030086851767188564,0
|
18 |
+
sciq,acc,0.913,0.008916866630745923,0
|
19 |
+
sciq,acc_norm,0.911,0.009008893392651518,0
|
20 |
+
storycloze_2016,acc,0.6916087653661144,0.010679734445487801,0
|
21 |
+
winogrande,acc,0.5556432517758485,0.013965196769083555,0
|
8b7178b13b/evaluation/rankeval/8b7178b13b_3_lm-eval_global_step84877_2023-05-15-10-06-37_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.344,
|
5 |
-
"acc_stderr": 0.015029633724408945
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.365,
|
9 |
-
"acc_stderr": 0.0152317762262649
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3333333333333333,
|
13 |
-
"acc_stderr": 0.013613950010225612
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.44642857142857145,
|
17 |
-
"acc_stderr": 0.06703189227942398,
|
18 |
-
"f1": 0.428030303030303
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.74,
|
22 |
-
"acc_stderr": 0.04408440022768079
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4431388169687313,
|
26 |
-
"acc_stderr": 0.004957410545559414,
|
27 |
-
"acc_norm": 0.58105954989046,
|
28 |
-
"acc_norm_stderr": 0.004923772581848488
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.48736462093862815,
|
32 |
-
"acc_stderr": 0.030086851767188564
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5556432517758485,
|
36 |
-
"acc_stderr": 0.013965196769083555
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6916087653661144,
|
40 |
-
"acc_stderr": 0.010679734445487801
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5314984709480123,
|
44 |
-
"acc_stderr": 0.00872768484861531
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6077441077441077,
|
48 |
-
"acc_stderr": 0.010018744689650043,
|
49 |
-
"acc_norm": 0.6022727272727273,
|
50 |
-
"acc_norm_stderr": 0.010042861602178056
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2858361774744027,
|
54 |
-
"acc_stderr": 0.013203196088537369,
|
55 |
-
"acc_norm": 0.3037542662116041,
|
56 |
-
"acc_norm_stderr": 0.01343890918477876
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.913,
|
60 |
-
"acc_stderr": 0.008916866630745923,
|
61 |
-
"acc_norm": 0.911,
|
62 |
-
"acc_norm_stderr": 0.009008893392651518
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7323177366702938,
|
66 |
-
"acc_stderr": 0.010330111189370429,
|
67 |
-
"acc_norm": 0.735038084874864,
|
68 |
-
"acc_norm_stderr": 0.010296557993316044
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b13b/evaluation/rankeval/8b7178b13b_4.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.363,0.015213890444671283,0
|
3 |
+
anli_r2,acc,0.362,0.0152048409129195,0
|
4 |
+
anli_r3,acc,0.3516666666666667,0.013789711695404806,0
|
5 |
+
arc_challenge,acc,0.27559726962457337,0.013057169655761838,0
|
6 |
+
arc_challenge,acc_norm,0.31313993174061433,0.013552671543623501,0
|
7 |
+
arc_easy,acc,0.6203703703703703,0.009958037725468565,0
|
8 |
+
arc_easy,acc_norm,0.6085858585858586,0.010014917532627824,0
|
9 |
+
boolq,acc,0.5162079510703363,0.008740459157499082,1
|
10 |
+
cb,acc,0.39285714285714285,0.0658538889806635,1
|
11 |
+
cb,f1,0.3340305010893247,,1
|
12 |
+
copa,acc,0.74,0.04408440022768078,0
|
13 |
+
hellaswag,acc,0.44064927305317664,0.004954503606471609,0
|
14 |
+
hellaswag,acc_norm,0.5764787890858395,0.004931065434173691,0
|
15 |
+
piqa,acc,0.7285092491838956,0.010376251176596135,0
|
16 |
+
piqa,acc_norm,0.7393906420021763,0.010241826155811632,0
|
17 |
+
rte,acc,0.44765342960288806,0.029931070362939526,0
|
18 |
+
sciq,acc,0.91,0.009054390204866444,0
|
19 |
+
sciq,acc_norm,0.914,0.008870325962594766,0
|
20 |
+
storycloze_2016,acc,0.6932121859967931,0.010664275190473634,0
|
21 |
+
winogrande,acc,0.5501183898973955,0.013981711904049732,0
|
8b7178b13b/evaluation/rankeval/8b7178b13b_4_lm-eval_global_step84877_2023-05-15-10-07-32_4shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.363,
|
5 |
-
"acc_stderr": 0.015213890444671283
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.362,
|
9 |
-
"acc_stderr": 0.0152048409129195
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3516666666666667,
|
13 |
-
"acc_stderr": 0.013789711695404806
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.39285714285714285,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.3340305010893247
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.74,
|
22 |
-
"acc_stderr": 0.04408440022768078
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.44064927305317664,
|
26 |
-
"acc_stderr": 0.004954503606471609,
|
27 |
-
"acc_norm": 0.5764787890858395,
|
28 |
-
"acc_norm_stderr": 0.004931065434173691
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.44765342960288806,
|
32 |
-
"acc_stderr": 0.029931070362939526
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5501183898973955,
|
36 |
-
"acc_stderr": 0.013981711904049732
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6932121859967931,
|
40 |
-
"acc_stderr": 0.010664275190473634
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5162079510703363,
|
44 |
-
"acc_stderr": 0.008740459157499082
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6203703703703703,
|
48 |
-
"acc_stderr": 0.009958037725468565,
|
49 |
-
"acc_norm": 0.6085858585858586,
|
50 |
-
"acc_norm_stderr": 0.010014917532627824
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.27559726962457337,
|
54 |
-
"acc_stderr": 0.013057169655761838,
|
55 |
-
"acc_norm": 0.31313993174061433,
|
56 |
-
"acc_norm_stderr": 0.013552671543623501
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.91,
|
60 |
-
"acc_stderr": 0.009054390204866444,
|
61 |
-
"acc_norm": 0.914,
|
62 |
-
"acc_norm_stderr": 0.008870325962594766
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7285092491838956,
|
66 |
-
"acc_stderr": 0.010376251176596135,
|
67 |
-
"acc_norm": 0.7393906420021763,
|
68 |
-
"acc_norm_stderr": 0.010241826155811632
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b13b/evaluation/rankeval/8b7178b13b_5.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.359,0.015177264224798601,0
|
3 |
+
anli_r2,acc,0.335,0.014933117490932573,0
|
4 |
+
anli_r3,acc,0.3258333333333333,0.013535422043417454,0
|
5 |
+
arc_challenge,acc,0.2832764505119454,0.013167478735134575,0
|
6 |
+
arc_challenge,acc_norm,0.3165529010238908,0.01359243151906808,0
|
7 |
+
arc_easy,acc,0.6094276094276094,0.010011059112064243,0
|
8 |
+
arc_easy,acc_norm,0.6119528619528619,0.009999295905750666,0
|
9 |
+
boolq,acc,0.519571865443425,0.008738352682962235,1
|
10 |
+
cb,acc,0.42857142857142855,0.06672848092813058,1
|
11 |
+
cb,f1,0.38723751912112364,,1
|
12 |
+
copa,acc,0.78,0.04163331998932262,0
|
13 |
+
hellaswag,acc,0.44343756223859787,0.0049577508971529426,0
|
14 |
+
hellaswag,acc_norm,0.5806612228639714,0.004924424018073683,0
|
15 |
+
piqa,acc,0.7247007616974973,0.01042142927736953,0
|
16 |
+
piqa,acc_norm,0.7393906420021763,0.010241826155811632,0
|
17 |
+
rte,acc,0.48014440433212996,0.0300727231673172,0
|
18 |
+
sciq,acc,0.913,0.008916866630745925,0
|
19 |
+
sciq,acc_norm,0.917,0.00872852720607479,0
|
20 |
+
storycloze_2016,acc,0.6937466595403528,0.010659088460112754,0
|
21 |
+
winogrande,acc,0.5540647198105761,0.013970093482330697,0
|
8b7178b13b/evaluation/rankeval/8b7178b13b_5_lm-eval_global_step84877_2023-05-15-10-06-37_5shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.359,
|
5 |
-
"acc_stderr": 0.015177264224798601
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.335,
|
9 |
-
"acc_stderr": 0.014933117490932573
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3258333333333333,
|
13 |
-
"acc_stderr": 0.013535422043417454
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.42857142857142855,
|
17 |
-
"acc_stderr": 0.06672848092813058,
|
18 |
-
"f1": 0.38723751912112364
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.78,
|
22 |
-
"acc_stderr": 0.04163331998932262
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.44343756223859787,
|
26 |
-
"acc_stderr": 0.0049577508971529426,
|
27 |
-
"acc_norm": 0.5806612228639714,
|
28 |
-
"acc_norm_stderr": 0.004924424018073683
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.48014440433212996,
|
32 |
-
"acc_stderr": 0.0300727231673172
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5540647198105761,
|
36 |
-
"acc_stderr": 0.013970093482330697
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.6937466595403528,
|
40 |
-
"acc_stderr": 0.010659088460112754
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.519571865443425,
|
44 |
-
"acc_stderr": 0.008738352682962235
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6094276094276094,
|
48 |
-
"acc_stderr": 0.010011059112064243,
|
49 |
-
"acc_norm": 0.6119528619528619,
|
50 |
-
"acc_norm_stderr": 0.009999295905750666
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2832764505119454,
|
54 |
-
"acc_stderr": 0.013167478735134575,
|
55 |
-
"acc_norm": 0.3165529010238908,
|
56 |
-
"acc_norm_stderr": 0.01359243151906808
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.913,
|
60 |
-
"acc_stderr": 0.008916866630745925,
|
61 |
-
"acc_norm": 0.917,
|
62 |
-
"acc_norm_stderr": 0.00872852720607479
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7247007616974973,
|
66 |
-
"acc_stderr": 0.01042142927736953,
|
67 |
-
"acc_norm": 0.7393906420021763,
|
68 |
-
"acc_norm_stderr": 0.010241826155811632
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b178b/evaluation/8b7178b178b_1_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.10766666666666666,
|
5 |
+
"em_stderr": 0.005659993848227298
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers",
|
14 |
+
"num_fewshot": 1,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b178b/evaluation/8b7178b178b_2_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.208,
|
5 |
+
"em_stderr": 0.007411498505927842
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers",
|
14 |
+
"num_fewshot": 2,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b178b/evaluation/8b7178b178b_3_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.2713333333333333,
|
5 |
+
"em_stderr": 0.008119472096605799
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers",
|
14 |
+
"num_fewshot": 3,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b178b/evaluation/8b7178b178b_4_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.309,
|
5 |
+
"em_stderr": 0.008437815608561314
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers",
|
14 |
+
"num_fewshot": 4,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b178b/evaluation/8b7178b178b_5_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.3273333333333333,
|
5 |
+
"em_stderr": 0.008568540173271721
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers",
|
14 |
+
"num_fewshot": 5,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b25b/evaluation/8b7178b25b_0_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.0,
|
5 |
+
"em_stderr": 0.0
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers",
|
14 |
+
"num_fewshot": 0,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b25b/evaluation/8b7178b25b_1_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.118,
|
5 |
+
"em_stderr": 0.005890973421765812
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers",
|
14 |
+
"num_fewshot": 1,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b25b/evaluation/8b7178b25b_2_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.24333333333333335,
|
5 |
+
"em_stderr": 0.007835466732772215
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers",
|
14 |
+
"num_fewshot": 2,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b25b/evaluation/8b7178b25b_3_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.2833333333333333,
|
5 |
+
"em_stderr": 0.008228472181192749
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers",
|
14 |
+
"num_fewshot": 3,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|
8b7178b25b/evaluation/8b7178b25b_4_babi.json
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"babi": {
|
4 |
+
"em": 0.30766666666666664,
|
5 |
+
"em_stderr": 0.008427710547037915
|
6 |
+
}
|
7 |
+
},
|
8 |
+
"versions": {
|
9 |
+
"babi": 0
|
10 |
+
},
|
11 |
+
"config": {
|
12 |
+
"model": "gpt2",
|
13 |
+
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers",
|
14 |
+
"num_fewshot": 4,
|
15 |
+
"batch_size": null,
|
16 |
+
"device": null,
|
17 |
+
"no_cache": true,
|
18 |
+
"limit": 3000,
|
19 |
+
"bootstrap_iters": 100000,
|
20 |
+
"description_dict": {}
|
21 |
+
}
|
22 |
+
}
|