Muennighoff committed on
Commit
16a575b
1 Parent(s): a915ab8
This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json +1 -0
  2. evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json +1 -0
  3. evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json +1 -0
  4. evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json +1 -0
  5. evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json +1 -0
  6. evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json +1 -0
  7. evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json +1 -0
  8. evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json +1 -0
  9. evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json +1 -0
  10. evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json +1 -0
  11. evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json +1 -0
  12. evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json +1 -0
  13. evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json +1 -0
  14. evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json +1 -0
  15. evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.jsonl +3 -0
  16. evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.jsonl +3 -0
  17. evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.jsonl +3 -0
  18. evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.jsonl +3 -0
  19. evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.jsonl +3 -0
  20. evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.jsonl +3 -0
  21. evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.jsonl +3 -0
  22. evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.jsonl +3 -0
  23. evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.jsonl +3 -0
  24. evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.jsonl +3 -0
  25. evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.jsonl +3 -0
  26. evaluation/generation/examples.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.jsonl +3 -0
  27. evaluation/generation/merged.csv +53 -0
  28. evaluation/generation/merged.json +1 -0
  29. evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json +133 -0
  30. evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json +133 -0
  31. evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json +133 -0
  32. evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json +133 -0
  33. evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json +133 -0
  34. evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json +133 -0
  35. evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json +133 -0
  36. evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json +133 -0
  37. evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json +133 -0
  38. evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json +133 -0
  39. evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json +133 -0
  40. evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json +133 -0
  41. evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json +133 -0
  42. evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json +133 -0
  43. evaluation/rankeval/lm1-2b8-55b-oscarroots_0.csv +21 -0
  44. evaluation/rankeval/lm1-2b8-55b-oscarroots_0.json +56 -1
  45. evaluation/rankeval/lm1-2b8-55b-oscarroots_0_lm-eval_global_step52452_2023-02-25-11-16-27_0shots_backup.json +0 -32
  46. evaluation/rankeval/lm1-2b8-55b-oscarroots_1.csv +21 -0
  47. evaluation/rankeval/lm1-2b8-55b-oscarroots_1.json +56 -1
  48. evaluation/rankeval/lm1-2b8-55b-oscarroots_1_lm-eval_global_step52452_2023-02-25-11-18-29_1shots_backup.json +0 -32
  49. evaluation/rankeval/lm1-2b8-55b-oscarroots_2.csv +21 -0
  50. evaluation/rankeval/lm1-2b8-55b-oscarroots_2.json +56 -1
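The trailing `_0` … `_5` in the generation filenames is the few-shot setting; it matches the `num_fewshot` value recorded in each file's `config` block in the hunks below. As an illustration only, the following Python sketch flattens the per-shot `agg.*.json` files into a single table of metric rows. The paths and output filename are assumptions; this is not claimed to be the script that produced `evaluation/generation/merged.csv`.

```python
import csv
import glob
import json

# Sketch: collect (task, prompt, metric, value, stderr, shots) rows from the
# aggregated result files added in this commit. Paths are hypothetical.
rows = []
for path in sorted(glob.glob("evaluation/generation/agg.*.json")):
    with open(path) as f:
        data = json.load(f)
    shots = data["config"]["num_fewshot"]
    for res in data["results"]:
        for key, value in res.items():
            # Numeric fields are the metrics (bleu, rouge*_precision, ...);
            # skip booleans and the *_stderr companions, which are attached below.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                continue
            if key.endswith("_stderr"):
                continue
            rows.append({
                "task": res["task_name"],
                "prompt": res["prompt_name"],
                "metric": key,
                "value": value,
                "stderr": res.get(f"{key}_stderr"),
                "shots": shots,
            })

with open("agg_summary.csv", "w", newline="") as f:
    writer = csv.DictWriter(
        f, fieldnames=["task", "prompt", "metric", "value", "stderr", "shots"]
    )
    writer.writeheader()
    writer.writerows(rows)
```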
evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.2562657244023514, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.028344931968085282}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.062384701456087446, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020959572066061704}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.2778238258177499, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0053586090146941585}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.08917647320322222, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001969587935532424}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.026957871325524504, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00119791534895436}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.12120656565208492, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003245668709085193}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.03885973727074715, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011411233576479184}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.059948665447718395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001993918360545074}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2696777384075072, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.005223094399514082}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.08588431201368854, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018539010477647254}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.05914169795430463, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020593373696654797}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.25838316242026105, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00491849430291812}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.08392617035294098, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001881634203344931}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
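Each `agg` file ends with a `config` block recording how the run was launched: the `hf-causal` wrapper, the `model_args` string pointing at the converted checkpoint and tokenizer, `num_fewshot`, batch size, device, the 3000-example limit, and the bootstrap/seed settings. A minimal sketch, assuming the path below, for confirming that the shot count in the filename agrees with the recorded config:

```python
import json
import re

# Hypothetical consistency check: the "_<n>.json" suffix of an agg file should
# match the "num_fewshot" value stored in its config block.
path = "evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json"
with open(path) as f:
    cfg = json.load(f)["config"]

shots_in_name = int(re.search(r"_(\d+)\.json$", path).group(1))
assert shots_in_name == cfg["num_fewshot"], (shots_in_name, cfg["num_fewshot"])
print(cfg["model"], cfg["num_fewshot"], cfg["batch_size"])
```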
evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.44906636509622533, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.039450588630595015}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.11288061517490074, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00386994874457578}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.27968139569340955, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004818798730073509}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.13316786961188862, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032035615287408364}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.052903771126418274, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0024560083140397187}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.13670929059101208, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0032752111527625427}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.0635687916821055, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020746952988998694}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.10294903334837356, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034785078085129637}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.26443193955147987, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004508721603501755}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.12246686808714284, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027865982212275067}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.1043770278231783, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0035484629358203635}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2648398336113232, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004503000799632382}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.12355055054182587, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002841014595626855}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4687852352128821, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.037809750600987985}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.12939788800380767, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004056238000462917}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3205721940394599, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004784451352989951}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.15485025651871712, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034302178802564512}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.06272684866048131, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002451752863709562}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16040449198937512, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003451151403396264}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.07545945954698206, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021937365311268525}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.11538953087200472, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00346589668028597}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.30175228398138576, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004432821529779081}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.14062442945039813, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028845693981807787}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.11767324801231895, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0035688465184641432}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3033452610052125, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004447592173686305}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.14250354572951518, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002959772587239683}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5387522714694768, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.023700066593108533}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.14028377513050538, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004341036446200867}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.33808480225227105, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004840897586860907}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1660401631118107, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0037496181550606754}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.07249658632299814, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029199126351436076}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1738659001719329, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003647562909324833}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.08448456167004659, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002619780755266116}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.12529450659786737, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003740919506421422}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3182618579312013, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004525730253384199}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.15093871662748365, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0032225789949202046}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.12866365606890057, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003897701131855631}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3209266755398485, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004550196873941636}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.15379137375695537, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0033296076273151513}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6154728892401495, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05492052153457257}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.1416216780254008, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004403560474088744}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.350369181874358, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004821260815960307}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.16918194985742865, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003726050287797417}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.07377427634131609, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029769913238928973}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.18193245830782417, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003657633632618109}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.08639654396535042, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0025466510150414677}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.12691040102952247, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0038030608678551996}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.33016396642156515, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004483672398798779}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.15419230913013102, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0031966205747524113}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.1301329876623986, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003962229410505262}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3326898264482343, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004514627367420683}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.15686161901707832, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003303387308503824}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6280080728696126, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04207890973166607}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.15509132478375567, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004705843597864089}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.36072408074578627, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004932090632111422}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1785781279253249, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003864561091208021}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.08180998073756966, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0031797028106942695}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1895120629191651, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0037580265385154963}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.09234282351104058, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0026995310439716595}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.13813720543380648, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004053616600136467}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.33922496409575775, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004622505715509403}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1620344936907299, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003324512248892316}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.14168764741861828, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004239364453934911}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.34078189394441194, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004625313010789047}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.16446961325308998, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0034245269422962608}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
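Each of these agg.*.json files has the same layout: a "results" list with one entry per metric (the value plus a matching *_stderr field) and a shared "config" block recording the run settings (hf-causal model, bfloat16, num_fewshot, batch_size 16, limit 3000, bootstrap_iters 10, seed 1234). Below is a minimal sketch of how the per-metric values could be pulled out of one of these files once it is checked out locally; the path is an assumption and any of the aggregate files added here works the same way.

```python
import json

# Assumed local path to one of the aggregate files added in this commit.
path = "evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json"

with open(path) as f:
    agg = json.load(f)

# Every entry in "results" carries exactly one metric value plus its *_stderr twin.
for entry in agg["results"]:
    for key, value in entry.items():
        if isinstance(value, float) and not key.endswith("_stderr"):
            stderr = entry.get(f"{key}_stderr", float("nan"))
            print(f'{entry["task_name"]} | {entry["prompt_name"]} | {key} = {value:.4f} (+/- {stderr:.4f})')

# Shared run settings live under "config".
print(agg["config"]["num_fewshot"], agg["config"]["batch_size"], agg["config"]["limit"])
```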
evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.14831832571904127, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020031574591708096}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.23669379639579727, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002619263402445374}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.16843326604027012, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018787988149176356}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.027495279137204204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008008675778483577}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.04558631579219217, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013034624565288761}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03124497623150672, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000826359540629872}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.11944058601706234, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014613554843883904}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.197662087047554, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022098401272736727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1373698337101501, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014061340796650912}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.13662513471708013, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001834224913870075}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.21848967035350617, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00242500324850602}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15530559329563356, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017277092690194619}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.4416513225274035, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07150581722616807}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2474989333475308, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004192775409890903}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.16589702996724112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002477684211999076}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1595930665634592, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020260063889951897}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.06456439217534583, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026036519415425876}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.03395563608028862, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011867665512589468}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03442090774378983, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011197085684525982}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.2007100384592662, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0036402244436710527}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.12968060330920403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019100623389590725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.12543184498335516, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015786460435038327}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2356687196157363, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004040097148447038}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.15692818247747958, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023149233432171795}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15113313197846198, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019006022698345565}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.9112541973999004, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08163120654641273}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.3671552578777415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00464310641179873}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.18316424412100443, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002534818661881543}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.20013093911819188, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021449192871676984}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.11652986149569292, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0033165278944766326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.048557963351484004, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012897480876968325}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05444986552431719, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012835369876590637}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.3030100733810117, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004160446544180668}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1454519391726235, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020102539956042787}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1600577177279773, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017194821563251488}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.34948494261958846, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00452044298551405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.17241597783925564, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023681037127358745}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18899973850835908, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020281161914398288}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.4920293719291493, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06820740373159184}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.31853023306817924, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005101746577027814}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.14722792301810253, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026467115286204013}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.16433308908585867, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002402134739956357}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.10260882729326853, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0033488070271773295}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.04037648707007591, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012615029724490608}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04583954485462417, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012897030660356447}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.26690611784240603, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004530048771512807}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.11874183155575303, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021251987841602225}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13364146877613045, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00195553742839865}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.3046492013868888, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004949401564936648}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.1393491071946238, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024976587731928646}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15587618164528091, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002273493914941302}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.6055821821151304, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06565825928935355}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.10914047578759525, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004168163234199949}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.04875414041264775, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0020018963702338544}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.05455257649733116, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002012404693346845}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.037261877113724175, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002437336913811512}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.013448412424643126, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008658300088105147}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.01577378906241384, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009385589100462621}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.09325777757588789, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003680574837942309}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.03991105511182912, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016279952050045603}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.045100032510421204, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016666399875103265}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.10390247551362326, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004006559051889402}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.04597031630948619, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0018861168846922784}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.051422475813845084, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018931617381586926}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.006932560692974423, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0017990239865701577}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.019024182319545703, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019567530452185085}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.00809080049652358, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008930277662354756}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.009250948371284507, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009365636508956353}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.007878162042298584, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012119853940782482}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0024981499124714754, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00036249889299460995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0031062476201456015, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00043823448085888057}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.016781942931569222, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017964368358031513}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.006703878314586593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0007330971808896851}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.007784924334651645, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007922903839285351}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.01855296243342967, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001923182050974051}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.00783323994370169, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008689584836341853}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.008943947340808445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009064353361518755}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.5486400435190646e-20, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 5.928974165985541e-18}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06536803730199292, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0040080446277792475}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0562578647719299, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003466710519052521}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05638135159346031, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0033862796262482922}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.013453387419618108, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014082914388032289}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.012484556808642255, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012859869239756115}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.01218356666574454, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012325430366165824}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.049695294130091966, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031067522139507655}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.04191724945748147, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026229390757436582}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.042197213021247744, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": 
"", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025684294584781297}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0501407378152416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00312422462775062}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.04244935232602974, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026544789695008624}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.042656324273577836, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002590374442922286}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.114133025618453, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03902166340342278}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 
0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da240232ef9803594f89287674f0f363b3ee085ab5ada3f7ba1fcf42c74238a9
+ size 4174292
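The examples.*.jsonl files are stored through Git LFS, so the diff only shows the three-line pointer (spec version, sha256 object id, byte size) rather than the generations themselves; fetching the real content needs the LFS objects (for example via `git lfs pull`). A tiny sketch, assuming a checkout where the pointer file has not yet been smudged, of reading those three fields:

```python
# Hypothetical helper: read the key/value lines of a Git LFS pointer file.
def read_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = read_lfs_pointer(
    "evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.jsonl"
)
print(ptr["oid"], ptr["size"])  # sha256:da24... 4174292
```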
evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f1792872b8360c67aa01618d992d596079f04524a1f1503f34e0c5a8ea05d498
+ size 4762910
evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9b9f81e6fa3c44ef15ccfd5e9a1b3a9c9e0ca5f9d010e2e5a1f113c7e000edae
+ size 5711084
evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6815796d6e2d668597f9433d5131bd8c3111fe44288ea40c86c6e7ce22c2c5a
+ size 6596539
evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0bfb2896e5849621f3f7a89e6828f03535322465ea21e99c1dc62eee20af2496
+ size 7512274
evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6857bff28723014fc359741f07d82b77ccad062bda89d8b023ae9517dfde3048
+ size 8376795
evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c6cae8b73c1ba64dd6f8a95d50808d0f96c3442588e0ee7c521e666397beb40
+ size 12958324
evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa0eff950ed43d1c253f2b1a3f259895bbffa3bee9d2b6502468fe7c3de050b8
+ size 18431550
evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7974ff13a24110559f516d7f81ffc6072c020bc498ceca28c4e7640bfc930e9e
+ size 23921835
evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b312eade5aa25054bd7f12b3e19c5d7ea6499e17edc996b6caa4bdcfca0d531a
+ size 29333993
evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:26e70db861ffdbfc8cf88d452d03a5b56b13a4b70c3a538eb70524271e3f7804
+ size 34778872
evaluation/generation/examples.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:65d4d0227bd4d26f3e9a99d734f3d030328c20d626c6f922920b3dc155977f38
+ size 13896009
evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
+ dataset,fewshots,prompt,metric,value
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.07031535968691954
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.07031535968691954
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.21317325143407617
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.21317325143407617
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.23375071908985015
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.23375071908985015
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.238646665315093
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.238646665315093
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.23937430569965518
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.23937430569965518
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.23628094628078694
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.23628094628078694
+ e2e_nlg_cleaned,5,average,multiple,0.20525687458439684
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.02780714664643612
+ gem_xsum,0,median,rouge2_fmeasure,0.02780714664643612
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04500583650450346
+ gem_xsum,1,median,rouge2_fmeasure,0.04500583650450346
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.051882326943507785
+ gem_xsum,2,median,rouge2_fmeasure,0.051882326943507785
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05187808695380671
+ gem_xsum,3,median,rouge2_fmeasure,0.05187808695380671
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01218356666574454
+ gem_xsum,4,median,rouge2_fmeasure,0.01218356666574454
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0
+ gem_xsum,5,median,rouge2_fmeasure,0.0
+ gem_xsum,5,average,multiple,0.031459493952333106
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.03885973727074715
+ web_nlg_en,0,median,rouge2_fmeasure,0.03885973727074715
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.0635687916821055
+ web_nlg_en,1,median,rouge2_fmeasure,0.0635687916821055
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.07545945954698206
+ web_nlg_en,2,median,rouge2_fmeasure,0.07545945954698206
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.08448456167004659
+ web_nlg_en,3,median,rouge2_fmeasure,0.08448456167004659
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.08639654396535042
+ web_nlg_en,4,median,rouge2_fmeasure,0.08639654396535042
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.09234282351104058
+ web_nlg_en,5,median,rouge2_fmeasure,0.09234282351104058
+ web_nlg_en,5,average,multiple,0.07351865294104538
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03124497623150672
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.03124497623150672
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03442090774378983
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.03442090774378983
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05444986552431719
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.05444986552431719
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04583954485462417
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04583954485462417
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01577378906241384
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.01577378906241384
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0031062476201456015
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0031062476201456015
53
+ wiki_lingua_en,5,average,multiple,0.030805888506132893
evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.2562657244023514, "bleu_stderr": 0.028344931968085282, "rouge1_fmeasure": 0.08917647320322222, "rouge1_fmeasure_stderr": 0.001969587935532424, "rouge1_precision": 0.062384701456087446, "rouge1_precision_stderr": 0.0020959572066061704, "rouge1_recall": 0.2778238258177499, "rouge1_recall_stderr": 0.0053586090146941585, "rouge2_fmeasure": 0.03885973727074715, "rouge2_fmeasure_stderr": 0.0011411233576479184, "rouge2_precision": 0.026957871325524504, "rouge2_precision_stderr": 0.00119791534895436, "rouge2_recall": 0.12120656565208492, "rouge2_recall_stderr": 0.003245668709085193, "rougeL_fmeasure": 0.08588431201368854, "rougeL_fmeasure_stderr": 0.0018539010477647254, "rougeL_precision": 0.059948665447718395, "rougeL_precision_stderr": 0.001993918360545074, "rougeL_recall": 0.2696777384075072, "rougeL_recall_stderr": 0.005223094399514082, "rougeLsum_fmeasure": 0.08392617035294098, "rougeLsum_fmeasure_stderr": 0.001881634203344931, "rougeLsum_precision": 0.05914169795430463, "rougeLsum_precision_stderr": 0.0020593373696654797, "rougeLsum_recall": 0.25838316242026105, "rougeLsum_recall_stderr": 0.00491849430291812}}, "1": {"PALM_prompt": {"bleu": 0.44906636509622533, "bleu_stderr": 0.039450588630595015, "rouge1_fmeasure": 0.13316786961188862, "rouge1_fmeasure_stderr": 0.0032035615287408364, "rouge1_precision": 0.11288061517490074, "rouge1_precision_stderr": 0.00386994874457578, "rouge1_recall": 0.27968139569340955, "rouge1_recall_stderr": 0.004818798730073509, "rouge2_fmeasure": 0.0635687916821055, "rouge2_fmeasure_stderr": 0.0020746952988998694, "rouge2_precision": 0.052903771126418274, "rouge2_precision_stderr": 0.0024560083140397187, "rouge2_recall": 0.13670929059101208, "rouge2_recall_stderr": 0.0032752111527625427, "rougeL_fmeasure": 0.12246686808714284, "rougeL_fmeasure_stderr": 0.0027865982212275067, "rougeL_precision": 0.10294903334837356, "rougeL_precision_stderr": 0.0034785078085129637, "rougeL_recall": 0.26443193955147987, "rougeL_recall_stderr": 0.004508721603501755, "rougeLsum_fmeasure": 0.12355055054182587, "rougeLsum_fmeasure_stderr": 0.002841014595626855, "rougeLsum_precision": 0.1043770278231783, "rougeLsum_precision_stderr": 0.0035484629358203635, "rougeLsum_recall": 0.2648398336113232, "rougeLsum_recall_stderr": 0.004503000799632382}}, "2": {"PALM_prompt": {"bleu": 0.4687852352128821, "bleu_stderr": 0.037809750600987985, "rouge1_fmeasure": 0.15485025651871712, "rouge1_fmeasure_stderr": 0.0034302178802564512, "rouge1_precision": 0.12939788800380767, "rouge1_precision_stderr": 0.004056238000462917, "rouge1_recall": 0.3205721940394599, "rouge1_recall_stderr": 0.004784451352989951, "rouge2_fmeasure": 0.07545945954698206, "rouge2_fmeasure_stderr": 0.0021937365311268525, "rouge2_precision": 0.06272684866048131, "rouge2_precision_stderr": 0.002451752863709562, "rouge2_recall": 0.16040449198937512, "rouge2_recall_stderr": 0.003451151403396264, "rougeL_fmeasure": 0.14062442945039813, "rougeL_fmeasure_stderr": 0.0028845693981807787, "rougeL_precision": 0.11538953087200472, "rougeL_precision_stderr": 0.00346589668028597, "rougeL_recall": 0.30175228398138576, "rougeL_recall_stderr": 0.004432821529779081, "rougeLsum_fmeasure": 0.14250354572951518, "rougeLsum_fmeasure_stderr": 0.002959772587239683, "rougeLsum_precision": 0.11767324801231895, "rougeLsum_precision_stderr": 0.0035688465184641432, "rougeLsum_recall": 0.3033452610052125, "rougeLsum_recall_stderr": 0.004447592173686305}}, "3": {"PALM_prompt": {"bleu": 0.5387522714694768, 
"bleu_stderr": 0.023700066593108533, "rouge1_fmeasure": 0.1660401631118107, "rouge1_fmeasure_stderr": 0.0037496181550606754, "rouge1_precision": 0.14028377513050538, "rouge1_precision_stderr": 0.004341036446200867, "rouge1_recall": 0.33808480225227105, "rouge1_recall_stderr": 0.004840897586860907, "rouge2_fmeasure": 0.08448456167004659, "rouge2_fmeasure_stderr": 0.002619780755266116, "rouge2_precision": 0.07249658632299814, "rouge2_precision_stderr": 0.0029199126351436076, "rouge2_recall": 0.1738659001719329, "rouge2_recall_stderr": 0.003647562909324833, "rougeL_fmeasure": 0.15093871662748365, "rougeL_fmeasure_stderr": 0.0032225789949202046, "rougeL_precision": 0.12529450659786737, "rougeL_precision_stderr": 0.003740919506421422, "rougeL_recall": 0.3182618579312013, "rougeL_recall_stderr": 0.004525730253384199, "rougeLsum_fmeasure": 0.15379137375695537, "rougeLsum_fmeasure_stderr": 0.0033296076273151513, "rougeLsum_precision": 0.12866365606890057, "rougeLsum_precision_stderr": 0.003897701131855631, "rougeLsum_recall": 0.3209266755398485, "rougeLsum_recall_stderr": 0.004550196873941636}}, "4": {"PALM_prompt": {"bleu": 0.6154728892401495, "bleu_stderr": 0.05492052153457257, "rouge1_fmeasure": 0.16918194985742865, "rouge1_fmeasure_stderr": 0.003726050287797417, "rouge1_precision": 0.1416216780254008, "rouge1_precision_stderr": 0.004403560474088744, "rouge1_recall": 0.350369181874358, "rouge1_recall_stderr": 0.004821260815960307, "rouge2_fmeasure": 0.08639654396535042, "rouge2_fmeasure_stderr": 0.0025466510150414677, "rouge2_precision": 0.07377427634131609, "rouge2_precision_stderr": 0.0029769913238928973, "rouge2_recall": 0.18193245830782417, "rouge2_recall_stderr": 0.003657633632618109, "rougeL_fmeasure": 0.15419230913013102, "rougeL_fmeasure_stderr": 0.0031966205747524113, "rougeL_precision": 0.12691040102952247, "rougeL_precision_stderr": 0.0038030608678551996, "rougeL_recall": 0.33016396642156515, "rougeL_recall_stderr": 0.004483672398798779, "rougeLsum_fmeasure": 0.15686161901707832, "rougeLsum_fmeasure_stderr": 0.003303387308503824, "rougeLsum_precision": 0.1301329876623986, "rougeLsum_precision_stderr": 0.003962229410505262, "rougeLsum_recall": 0.3326898264482343, "rougeLsum_recall_stderr": 0.004514627367420683}}, "5": {"PALM_prompt": {"bleu": 0.6280080728696126, "bleu_stderr": 0.04207890973166607, "rouge1_fmeasure": 0.1785781279253249, "rouge1_fmeasure_stderr": 0.003864561091208021, "rouge1_precision": 0.15509132478375567, "rouge1_precision_stderr": 0.004705843597864089, "rouge1_recall": 0.36072408074578627, "rouge1_recall_stderr": 0.004932090632111422, "rouge2_fmeasure": 0.09234282351104058, "rouge2_fmeasure_stderr": 0.0026995310439716595, "rouge2_precision": 0.08180998073756966, "rouge2_precision_stderr": 0.0031797028106942695, "rouge2_recall": 0.1895120629191651, "rouge2_recall_stderr": 0.0037580265385154963, "rougeL_fmeasure": 0.1620344936907299, "rougeL_fmeasure_stderr": 0.003324512248892316, "rougeL_precision": 0.13813720543380648, "rougeL_precision_stderr": 0.004053616600136467, "rougeL_recall": 0.33922496409575775, "rougeL_recall_stderr": 0.004622505715509403, "rougeLsum_fmeasure": 0.16446961325308998, "rougeLsum_fmeasure_stderr": 0.0034245269422962608, "rougeLsum_precision": 0.14168764741861828, "rougeLsum_precision_stderr": 0.004239364453934911, "rougeLsum_recall": 0.34078189394441194, "rougeLsum_recall_stderr": 0.004625313010789047}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4416513225274035, "bleu_stderr": 0.07150581722616807, "rouge1_fmeasure": 
0.16843326604027012, "rouge1_fmeasure_stderr": 0.0018787988149176356, "rouge1_precision": 0.14831832571904127, "rouge1_precision_stderr": 0.0020031574591708096, "rouge1_recall": 0.23669379639579727, "rouge1_recall_stderr": 0.002619263402445374, "rouge2_fmeasure": 0.03124497623150672, "rouge2_fmeasure_stderr": 0.000826359540629872, "rouge2_precision": 0.027495279137204204, "rouge2_precision_stderr": 0.0008008675778483577, "rouge2_recall": 0.04558631579219217, "rouge2_recall_stderr": 0.0013034624565288761, "rougeL_fmeasure": 0.1373698337101501, "rougeL_fmeasure_stderr": 0.0014061340796650912, "rougeL_precision": 0.11944058601706234, "rougeL_precision_stderr": 0.0014613554843883904, "rougeL_recall": 0.197662087047554, "rougeL_recall_stderr": 0.0022098401272736727, "rougeLsum_fmeasure": 0.15530559329563356, "rougeLsum_fmeasure_stderr": 0.0017277092690194619, "rougeLsum_precision": 0.13662513471708013, "rougeLsum_precision_stderr": 0.001834224913870075, "rougeLsum_recall": 0.21848967035350617, "rougeLsum_recall_stderr": 0.00242500324850602}}, "1": {"tldr_en": {"bleu": 1.9112541973999004, "bleu_stderr": 0.08163120654641273, "rouge1_fmeasure": 0.1595930665634592, "rouge1_fmeasure_stderr": 0.0020260063889951897, "rouge1_precision": 0.2474989333475308, "rouge1_precision_stderr": 0.004192775409890903, "rouge1_recall": 0.16589702996724112, "rouge1_recall_stderr": 0.002477684211999076, "rouge2_fmeasure": 0.03442090774378983, "rouge2_fmeasure_stderr": 0.0011197085684525982, "rouge2_precision": 0.06456439217534583, "rouge2_precision_stderr": 0.0026036519415425876, "rouge2_recall": 0.03395563608028862, "rouge2_recall_stderr": 0.0011867665512589468, "rougeL_fmeasure": 0.12543184498335516, "rougeL_fmeasure_stderr": 0.0015786460435038327, "rougeL_precision": 0.2007100384592662, "rougeL_precision_stderr": 0.0036402244436710527, "rougeL_recall": 0.12968060330920403, "rougeL_recall_stderr": 0.0019100623389590725, "rougeLsum_fmeasure": 0.15113313197846198, "rougeLsum_fmeasure_stderr": 0.0019006022698345565, "rougeLsum_precision": 0.2356687196157363, "rougeLsum_precision_stderr": 0.004040097148447038, "rougeLsum_recall": 0.15692818247747958, "rougeLsum_recall_stderr": 0.0023149233432171795}}, "2": {"tldr_en": {"bleu": 2.4920293719291493, "bleu_stderr": 0.06820740373159184, "rouge1_fmeasure": 0.20013093911819188, "rouge1_fmeasure_stderr": 0.0021449192871676984, "rouge1_precision": 0.3671552578777415, "rouge1_precision_stderr": 0.00464310641179873, "rouge1_recall": 0.18316424412100443, "rouge1_recall_stderr": 0.002534818661881543, "rouge2_fmeasure": 0.05444986552431719, "rouge2_fmeasure_stderr": 0.0012835369876590637, "rouge2_precision": 0.11652986149569292, "rouge2_precision_stderr": 0.0033165278944766326, "rouge2_recall": 0.048557963351484004, "rouge2_recall_stderr": 0.0012897480876968325, "rougeL_fmeasure": 0.1600577177279773, "rougeL_fmeasure_stderr": 0.0017194821563251488, "rougeL_precision": 0.3030100733810117, "rougeL_precision_stderr": 0.004160446544180668, "rougeL_recall": 0.1454519391726235, "rougeL_recall_stderr": 0.0020102539956042787, "rougeLsum_fmeasure": 0.18899973850835908, "rougeLsum_fmeasure_stderr": 0.0020281161914398288, "rougeLsum_precision": 0.34948494261958846, "rougeLsum_precision_stderr": 0.00452044298551405, "rougeLsum_recall": 0.17241597783925564, "rougeLsum_recall_stderr": 0.0023681037127358745}}, "3": {"tldr_en": {"bleu": 1.6055821821151304, "bleu_stderr": 0.06565825928935355, "rouge1_fmeasure": 0.16433308908585867, "rouge1_fmeasure_stderr": 0.002402134739956357, "rouge1_precision": 
0.31853023306817924, "rouge1_precision_stderr": 0.005101746577027814, "rouge1_recall": 0.14722792301810253, "rouge1_recall_stderr": 0.0026467115286204013, "rouge2_fmeasure": 0.04583954485462417, "rouge2_fmeasure_stderr": 0.0012897030660356447, "rouge2_precision": 0.10260882729326853, "rouge2_precision_stderr": 0.0033488070271773295, "rouge2_recall": 0.04037648707007591, "rouge2_recall_stderr": 0.0012615029724490608, "rougeL_fmeasure": 0.13364146877613045, "rougeL_fmeasure_stderr": 0.00195553742839865, "rougeL_precision": 0.26690611784240603, "rougeL_precision_stderr": 0.004530048771512807, "rougeL_recall": 0.11874183155575303, "rougeL_recall_stderr": 0.0021251987841602225, "rougeLsum_fmeasure": 0.15587618164528091, "rougeLsum_fmeasure_stderr": 0.002273493914941302, "rougeLsum_precision": 0.3046492013868888, "rougeLsum_precision_stderr": 0.004949401564936648, "rougeLsum_recall": 0.1393491071946238, "rougeLsum_recall_stderr": 0.0024976587731928646}}, "4": {"tldr_en": {"bleu": 0.006932560692974423, "bleu_stderr": 0.0017990239865701577, "rouge1_fmeasure": 0.05455257649733116, "rouge1_fmeasure_stderr": 0.002012404693346845, "rouge1_precision": 0.10914047578759525, "rouge1_precision_stderr": 0.004168163234199949, "rouge1_recall": 0.04875414041264775, "rouge1_recall_stderr": 0.0020018963702338544, "rouge2_fmeasure": 0.01577378906241384, "rouge2_fmeasure_stderr": 0.0009385589100462621, "rouge2_precision": 0.037261877113724175, "rouge2_precision_stderr": 0.002437336913811512, "rouge2_recall": 0.013448412424643126, "rouge2_recall_stderr": 0.0008658300088105147, "rougeL_fmeasure": 0.045100032510421204, "rougeL_fmeasure_stderr": 0.0016666399875103265, "rougeL_precision": 0.09325777757588789, "rougeL_precision_stderr": 0.003680574837942309, "rougeL_recall": 0.03991105511182912, "rougeL_recall_stderr": 0.0016279952050045603, "rougeLsum_fmeasure": 0.051422475813845084, "rougeLsum_fmeasure_stderr": 0.0018931617381586926, "rougeLsum_precision": 0.10390247551362326, "rougeLsum_precision_stderr": 0.004006559051889402, "rougeLsum_recall": 0.04597031630948619, "rougeLsum_recall_stderr": 0.0018861168846922784}}, "5": {"tldr_en": {"bleu": 1.5486400435190646e-20, "bleu_stderr": 5.928974165985541e-18, "rouge1_fmeasure": 0.009250948371284507, "rouge1_fmeasure_stderr": 0.0009365636508956353, "rouge1_precision": 0.019024182319545703, "rouge1_precision_stderr": 0.0019567530452185085, "rouge1_recall": 0.00809080049652358, "rouge1_recall_stderr": 0.0008930277662354756, "rouge2_fmeasure": 0.0031062476201456015, "rouge2_fmeasure_stderr": 0.00043823448085888057, "rouge2_precision": 0.007878162042298584, "rouge2_precision_stderr": 0.0012119853940782482, "rouge2_recall": 0.0024981499124714754, "rouge2_recall_stderr": 0.00036249889299460995, "rougeL_fmeasure": 0.007784924334651645, "rougeL_fmeasure_stderr": 0.0007922903839285351, "rougeL_precision": 0.016781942931569222, "rougeL_precision_stderr": 0.0017964368358031513, "rougeL_recall": 0.006703878314586593, "rougeL_recall_stderr": 0.0007330971808896851, "rougeLsum_fmeasure": 0.008943947340808445, "rougeLsum_fmeasure_stderr": 0.0009064353361518755, "rougeLsum_precision": 0.01855296243342967, "rougeLsum_precision_stderr": 0.001923182050974051, "rougeLsum_recall": 0.00783323994370169, "rougeLsum_recall_stderr": 0.0008689584836341853}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.9626050306721474, "bleu_stderr": 0.06024253979194259, "rouge1_fmeasure": 0.20284230759257002, "rouge1_fmeasure_stderr": 0.0017464348006575089, "rouge1_precision": 
0.1540985111881967, "rouge1_precision_stderr": 0.0014515620097711474, "rouge1_recall": 0.31140262883046665, "rouge1_recall_stderr": 0.0024251988602698067, "rouge2_fmeasure": 0.07031535968691954, "rouge2_fmeasure_stderr": 0.001168196298024764, "rouge2_precision": 0.05355018382472272, "rouge2_precision_stderr": 0.0009106172876128309, "rouge2_recall": 0.10690154622755875, "rouge2_recall_stderr": 0.0017688696499961142, "rougeL_fmeasure": 0.1753585977687643, "rougeL_fmeasure_stderr": 0.0014608296346256319, "rougeL_precision": 0.1327617315507999, "rougeL_precision_stderr": 0.001197611976138011, "rougeL_recall": 0.2711114650902915, "rougeL_recall_stderr": 0.0021100724050297824, "rougeLsum_fmeasure": 0.18282034804175978, "rougeLsum_fmeasure_stderr": 0.0016241979991028408, "rougeLsum_precision": 0.1387916405736721, "rougeLsum_precision_stderr": 0.0013406445609266567, "rougeLsum_recall": 0.2810609872616038, "rougeLsum_recall_stderr": 0.0022804790016154107}}, "1": {"generate_text_restaurant": {"bleu": 11.593827123577368, "bleu_stderr": 0.12407021076125696, "rouge1_fmeasure": 0.45020216196031243, "rouge1_fmeasure_stderr": 0.0023747076071133698, "rouge1_precision": 0.5509122991009537, "rouge1_precision_stderr": 0.0033661250504397264, "rouge1_recall": 0.42140078880413323, "rouge1_recall_stderr": 0.002953747532093766, "rouge2_fmeasure": 0.21317325143407617, "rouge2_fmeasure_stderr": 0.0020127889592610262, "rouge2_precision": 0.2650742428626904, "rouge2_precision_stderr": 0.0027402930606959166, "rouge2_recall": 0.19921132653073137, "rouge2_recall_stderr": 0.0021274437677356846, "rougeL_fmeasure": 0.32668549807617586, "rougeL_fmeasure_stderr": 0.002086318029186934, "rougeL_precision": 0.40311524575775876, "rougeL_precision_stderr": 0.003054424817440509, "rougeL_recall": 0.30479322858663915, "rougeL_recall_stderr": 0.0023980133198678024, "rougeLsum_fmeasure": 0.3686111369512851, "rougeLsum_fmeasure_stderr": 0.0023418416254223887, "rougeLsum_precision": 0.4523945212247963, "rougeLsum_precision_stderr": 0.00326815326036064, "rougeLsum_recall": 0.3446091324717908, "rougeLsum_recall_stderr": 0.0027191831013193444}}, "2": {"generate_text_restaurant": {"bleu": 12.88190096920724, "bleu_stderr": 0.17583922253090617, "rouge1_fmeasure": 0.4721264666575481, "rouge1_fmeasure_stderr": 0.002268547262390229, "rouge1_precision": 0.5822720419906148, "rouge1_precision_stderr": 0.0032962978807536585, "rouge1_recall": 0.4337280880183242, "rouge1_recall_stderr": 0.0028431977686949614, "rouge2_fmeasure": 0.23375071908985015, "rouge2_fmeasure_stderr": 0.0020523859536162883, "rouge2_precision": 0.2931157748619445, "rouge2_precision_stderr": 0.0028165108542885105, "rouge2_recall": 0.2147077046167617, "rouge2_recall_stderr": 0.0021786795843148085, "rougeL_fmeasure": 0.3522716641150115, "rougeL_fmeasure_stderr": 0.0020991364053741914, "rougeL_precision": 0.4369685674136242, "rougeL_precision_stderr": 0.003073517456588823, "rougeL_recall": 0.32297689774894583, "rougeL_recall_stderr": 0.0024097063140581243, "rougeLsum_fmeasure": 0.393682260751447, "rougeLsum_fmeasure_stderr": 0.0023007053568388975, "rougeLsum_precision": 0.4863687819740207, "rougeLsum_precision_stderr": 0.0032507530347318466, "rougeLsum_recall": 0.3613566974986443, "rougeLsum_recall_stderr": 0.0026568660208198115}}, "3": {"generate_text_restaurant": {"bleu": 13.262436270778784, "bleu_stderr": 0.12966668718803287, "rouge1_fmeasure": 0.47525930406845607, "rouge1_fmeasure_stderr": 0.0022663496963600087, "rouge1_precision": 0.5813922477147344, "rouge1_precision_stderr": 
0.0032594641701596022, "rouge1_recall": 0.43750260743630337, "rouge1_recall_stderr": 0.002850951212768957, "rouge2_fmeasure": 0.238646665315093, "rouge2_fmeasure_stderr": 0.0020938647359093243, "rouge2_precision": 0.2963848070406136, "rouge2_precision_stderr": 0.002811561987291426, "rouge2_recall": 0.2198532494536229, "rouge2_recall_stderr": 0.0022399911777690423, "rougeL_fmeasure": 0.3550920935079866, "rougeL_fmeasure_stderr": 0.002168420115905195, "rougeL_precision": 0.43608943087396823, "rougeL_precision_stderr": 0.0030728003407875415, "rougeL_recall": 0.32653357863256616, "rougeL_recall_stderr": 0.002478214580278462, "rougeLsum_fmeasure": 0.3982122870175536, "rougeLsum_fmeasure_stderr": 0.002333698912202139, "rougeLsum_precision": 0.4873959951365985, "rougeLsum_precision_stderr": 0.0032309608357424142, "rougeLsum_recall": 0.3665706405925854, "rougeLsum_recall_stderr": 0.0027071065833861527}}, "4": {"generate_text_restaurant": {"bleu": 13.476662518674425, "bleu_stderr": 0.14804924963101374, "rouge1_fmeasure": 0.4751483029381261, "rouge1_fmeasure_stderr": 0.0022939871226639315, "rouge1_precision": 0.575148309756594, "rouge1_precision_stderr": 0.0032456743499240143, "rouge1_recall": 0.4379553251280355, "rouge1_recall_stderr": 0.0027875515895071923, "rouge2_fmeasure": 0.23937430569965518, "rouge2_fmeasure_stderr": 0.0021223157395609733, "rouge2_precision": 0.2932213385624628, "rouge2_precision_stderr": 0.0027850927611603377, "rouge2_recall": 0.220787156975807, "rouge2_recall_stderr": 0.0022308643708334754, "rougeL_fmeasure": 0.35385483221937813, "rougeL_fmeasure_stderr": 0.0021780339813240314, "rougeL_precision": 0.4297630033921168, "rougeL_precision_stderr": 0.0030250335936550877, "rougeL_recall": 0.3260216890650744, "rougeL_recall_stderr": 0.0024522487516979575, "rougeLsum_fmeasure": 0.3976384682503085, "rougeLsum_fmeasure_stderr": 0.0023729927223769785, "rougeLsum_precision": 0.4813876805218902, "rougeLsum_precision_stderr": 0.0032181540128894265, "rougeLsum_recall": 0.36675888786779215, "rougeLsum_recall_stderr": 0.002699461696539119}}, "5": {"generate_text_restaurant": {"bleu": 13.156897203455948, "bleu_stderr": 0.08630136341873869, "rouge1_fmeasure": 0.4729751649957838, "rouge1_fmeasure_stderr": 0.0022801925622949843, "rouge1_precision": 0.5738627336760422, "rouge1_precision_stderr": 0.0032894505336520264, "rouge1_recall": 0.4353618479598648, "rouge1_recall_stderr": 0.0027767731270488508, "rouge2_fmeasure": 0.23628094628078694, "rouge2_fmeasure_stderr": 0.00211214055033469, "rouge2_precision": 0.29086794940233046, "rouge2_precision_stderr": 0.002818639353471394, "rouge2_recall": 0.21741764823749096, "rouge2_recall_stderr": 0.002221377915423391, "rougeL_fmeasure": 0.3534318578391924, "rougeL_fmeasure_stderr": 0.0021797527155027353, "rougeL_precision": 0.4304412705229436, "rougeL_precision_stderr": 0.003073791049621951, "rougeL_recall": 0.32508975084283026, "rougeL_recall_stderr": 0.0024528402950547467, "rougeLsum_fmeasure": 0.39642244501779056, "rougeLsum_fmeasure_stderr": 0.002354679808047655, "rougeLsum_precision": 0.4815893698288744, "rougeLsum_precision_stderr": 0.0032662227256431515, "rougeLsum_recall": 0.36475434401700674, "rougeLsum_recall_stderr": 0.00266853275926591}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.0285249742430023, "bleu_stderr": 0.04658561770880105, "rouge1_fmeasure": 0.1478842341216362, "rouge1_fmeasure_stderr": 0.0027472203051295764, "rouge1_precision": 0.10711262762227787, "rouge1_precision_stderr": 0.0020601915536585967, "rouge1_recall": 
0.25337181483295146, "rouge1_recall_stderr": 0.0046856436696164905, "rouge2_fmeasure": 0.02780714664643612, "rouge2_fmeasure_stderr": 0.0012134748561574466, "rouge2_precision": 0.019781013833534398, "rouge2_precision_stderr": 0.0008722452654034713, "rouge2_recall": 0.04912277748197487, "rouge2_recall_stderr": 0.0021576559260432712, "rougeL_fmeasure": 0.11266163818049864, "rougeL_fmeasure_stderr": 0.0020299599299667603, "rougeL_precision": 0.08153762034631415, "rougeL_precision_stderr": 0.0015205040880351287, "rougeL_recall": 0.19377740656973594, "rougeL_recall_stderr": 0.0035212605780510554, "rougeLsum_fmeasure": 0.11968068511417429, "rougeLsum_fmeasure_stderr": 0.0021748845339722685, "rougeLsum_precision": 0.08657778420332544, "rougeLsum_precision_stderr": 0.0016263154802718866, "rougeLsum_recall": 0.20583552514369016, "rougeLsum_recall_stderr": 0.003771689144281502}}, "1": {"article_DOC_summary": {"bleu": 2.1792875082477123, "bleu_stderr": 0.06319420014636688, "rouge1_fmeasure": 0.2183934186796248, "rouge1_fmeasure_stderr": 0.0031131023395006203, "rouge1_precision": 0.2154872949099143, "rouge1_precision_stderr": 0.0037469904189591395, "rouge1_recall": 0.25867586586469277, "rouge1_recall_stderr": 0.004134512250176692, "rouge2_fmeasure": 0.04500583650450346, "rouge2_fmeasure_stderr": 0.001863892149786922, "rouge2_precision": 0.0447245429308555, "rouge2_precision_stderr": 0.0020714108653434506, "rouge2_recall": 0.05502827731187604, "rouge2_recall_stderr": 0.0023758194054523916, "rougeL_fmeasure": 0.1635144970491249, "rougeL_fmeasure_stderr": 0.0024820064917382306, "rougeL_precision": 0.1614628963956234, "rougeL_precision_stderr": 0.0030120325913654804, "rougeL_recall": 0.19472956411902964, "rougeL_recall_stderr": 0.0033235017592203077, "rougeLsum_fmeasure": 0.1671666603817299, "rougeLsum_fmeasure_stderr": 0.0025413985087851003, "rougeLsum_precision": 0.16439844895006708, "rougeLsum_precision_stderr": 0.003021699034691555, "rougeLsum_recall": 0.2002131667137321, "rougeLsum_recall_stderr": 0.003522860713518912}}, "2": {"article_DOC_summary": {"bleu": 2.9719119603418074, "bleu_stderr": 0.16832095088560176, "rouge1_fmeasure": 0.23579050025596546, "rouge1_fmeasure_stderr": 0.0032208877086972645, "rouge1_precision": 0.24578221078853943, "rouge1_precision_stderr": 0.0038311545433087246, "rouge1_recall": 0.25247232169607386, "rouge1_recall_stderr": 0.003851791759690215, "rouge2_fmeasure": 0.051882326943507785, "rouge2_fmeasure_stderr": 0.002120030804437139, "rouge2_precision": 0.05385976155670831, "rouge2_precision_stderr": 0.002293715583911734, "rouge2_recall": 0.05655894066313464, "rouge2_recall_stderr": 0.002406489537857907, "rougeL_fmeasure": 0.17839101172174274, "rougeL_fmeasure_stderr": 0.0026918945920387577, "rougeL_precision": 0.18601506569279253, "rougeL_precision_stderr": 0.0031620641585856746, "rougeL_recall": 0.19159840411716447, "rougeL_recall_stderr": 0.003222530909275831, "rougeLsum_fmeasure": 0.18050054360244336, "rougeLsum_fmeasure_stderr": 0.002709070917463213, "rougeLsum_precision": 0.18783807621807994, "rougeLsum_precision_stderr": 0.003156835477899629, "rougeLsum_recall": 0.19452321319767416, "rougeLsum_recall_stderr": 0.0033253733370942227}}, "3": {"article_DOC_summary": {"bleu": 3.2041870719788714, "bleu_stderr": 0.20611661341919532, "rouge1_fmeasure": 0.2277409702430376, "rouge1_fmeasure_stderr": 0.0036665816219810165, "rouge1_precision": 0.24477878269919434, "rouge1_precision_stderr": 0.0043304256076557015, "rouge1_recall": 0.23546035253406764, "rouge1_recall_stderr": 
0.0040420090423460295, "rouge2_fmeasure": 0.05187808695380671, "rouge2_fmeasure_stderr": 0.0022736997047107283, "rouge2_precision": 0.05535540959023755, "rouge2_precision_stderr": 0.002520852290920342, "rouge2_recall": 0.05400488265902618, "rouge2_recall_stderr": 0.0023942103295597683, "rougeL_fmeasure": 0.1726765296689622, "rougeL_fmeasure_stderr": 0.0030037675906149446, "rougeL_precision": 0.18676731153569484, "rougeL_precision_stderr": 0.0035994106495257387, "rougeL_recall": 0.17809432971388495, "rougeL_recall_stderr": 0.003241778335625792, "rougeLsum_fmeasure": 0.1742948037879305, "rougeLsum_fmeasure_stderr": 0.0030227160469161974, "rougeLsum_precision": 0.1881533844870454, "rougeLsum_precision_stderr": 0.003601435982381053, "rougeLsum_recall": 0.18040945162861388, "rougeLsum_recall_stderr": 0.0033378628217040685}}, "4": {"article_DOC_summary": {"bleu": 0.114133025618453, "bleu_stderr": 0.03902166340342278, "rouge1_fmeasure": 0.05638135159346031, "rouge1_fmeasure_stderr": 0.0033862796262482922, "rouge1_precision": 0.06536803730199292, "rouge1_precision_stderr": 0.0040080446277792475, "rouge1_recall": 0.0562578647719299, "rouge1_recall_stderr": 0.003466710519052521, "rouge2_fmeasure": 0.01218356666574454, "rouge2_fmeasure_stderr": 0.0012325430366165824, "rouge2_precision": 0.013453387419618108, "rouge2_precision_stderr": 0.0014082914388032289, "rouge2_recall": 0.012484556808642255, "rouge2_recall_stderr": 0.0012859869239756115, "rougeL_fmeasure": 0.042197213021247744, "rougeL_fmeasure_stderr": 0.0025684294584781297, "rougeL_precision": 0.049695294130091966, "rougeL_precision_stderr": 0.0031067522139507655, "rougeL_recall": 0.04191724945748147, "rougeL_recall_stderr": 0.0026229390757436582, "rougeLsum_fmeasure": 0.042656324273577836, "rougeLsum_fmeasure_stderr": 0.002590374442922286, "rougeLsum_precision": 0.0501407378152416, "rougeLsum_precision_stderr": 0.00312422462775062, "rougeLsum_recall": 0.04244935232602974, "rougeLsum_recall_stderr": 0.0026544789695008624}}, "5": {"article_DOC_summary": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0, "rouge1_fmeasure_stderr": 0.0, "rouge1_precision": 0.0, "rouge1_precision_stderr": 0.0, "rouge1_recall": 0.0, "rouge1_recall_stderr": 0.0, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0, "rougeL_fmeasure_stderr": 0.0, "rougeL_precision": 0.0, "rougeL_precision_stderr": 0.0, "rougeL_recall": 0.0, "rougeL_recall_stderr": 0.0, "rougeLsum_fmeasure": 0.0, "rougeLsum_fmeasure_stderr": 0.0, "rougeLsum_precision": 0.0, "rougeLsum_precision_stderr": 0.0, "rougeLsum_recall": 0.0, "rougeLsum_recall_stderr": 0.0}}}}
evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/web_nlg_en",
5
+ "prompt_name": "PALM_prompt",
6
+ "bleu": 0.2562657244023514,
7
+ "dataset_path": "GEM/web_nlg",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "bleu_stderr": 0.028344931968085282
11
+ },
12
+ {
13
+ "task_name": "GEM/web_nlg_en",
14
+ "prompt_name": "PALM_prompt",
15
+ "rouge1_precision": 0.062384701456087446,
16
+ "dataset_path": "GEM/web_nlg",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.0020959572066061704
20
+ },
21
+ {
22
+ "task_name": "GEM/web_nlg_en",
23
+ "prompt_name": "PALM_prompt",
24
+ "rouge1_recall": 0.2778238258177499,
25
+ "dataset_path": "GEM/web_nlg",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.0053586090146941585
29
+ },
30
+ {
31
+ "task_name": "GEM/web_nlg_en",
32
+ "prompt_name": "PALM_prompt",
33
+ "rouge1_fmeasure": 0.08917647320322222,
34
+ "dataset_path": "GEM/web_nlg",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.001969587935532424
38
+ },
39
+ {
40
+ "task_name": "GEM/web_nlg_en",
41
+ "prompt_name": "PALM_prompt",
42
+ "rouge2_precision": 0.026957871325524504,
43
+ "dataset_path": "GEM/web_nlg",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.00119791534895436
47
+ },
48
+ {
49
+ "task_name": "GEM/web_nlg_en",
50
+ "prompt_name": "PALM_prompt",
51
+ "rouge2_recall": 0.12120656565208492,
52
+ "dataset_path": "GEM/web_nlg",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.003245668709085193
56
+ },
57
+ {
58
+ "task_name": "GEM/web_nlg_en",
59
+ "prompt_name": "PALM_prompt",
60
+ "rouge2_fmeasure": 0.03885973727074715,
61
+ "dataset_path": "GEM/web_nlg",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.0011411233576479184
65
+ },
66
+ {
67
+ "task_name": "GEM/web_nlg_en",
68
+ "prompt_name": "PALM_prompt",
69
+ "rougeL_precision": 0.059948665447718395,
70
+ "dataset_path": "GEM/web_nlg",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.001993918360545074
74
+ },
75
+ {
76
+ "task_name": "GEM/web_nlg_en",
77
+ "prompt_name": "PALM_prompt",
78
+ "rougeL_recall": 0.2696777384075072,
79
+ "dataset_path": "GEM/web_nlg",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.005223094399514082
83
+ },
84
+ {
85
+ "task_name": "GEM/web_nlg_en",
86
+ "prompt_name": "PALM_prompt",
87
+ "rougeL_fmeasure": 0.08588431201368854,
88
+ "dataset_path": "GEM/web_nlg",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.0018539010477647254
92
+ },
93
+ {
94
+ "task_name": "GEM/web_nlg_en",
95
+ "prompt_name": "PALM_prompt",
96
+ "rougeLsum_precision": 0.05914169795430463,
97
+ "dataset_path": "GEM/web_nlg",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.0020593373696654797
101
+ },
102
+ {
103
+ "task_name": "GEM/web_nlg_en",
104
+ "prompt_name": "PALM_prompt",
105
+ "rougeLsum_recall": 0.25838316242026105,
106
+ "dataset_path": "GEM/web_nlg",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.00491849430291812
110
+ },
111
+ {
112
+ "task_name": "GEM/web_nlg_en",
113
+ "prompt_name": "PALM_prompt",
114
+ "rougeLsum_fmeasure": 0.08392617035294098,
115
+ "dataset_path": "GEM/web_nlg",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.001881634203344931
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 0,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/web_nlg_en",
5
+ "prompt_name": "PALM_prompt",
6
+ "bleu": 0.44906636509622533,
7
+ "dataset_path": "GEM/web_nlg",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "bleu_stderr": 0.039450588630595015
11
+ },
12
+ {
13
+ "task_name": "GEM/web_nlg_en",
14
+ "prompt_name": "PALM_prompt",
15
+ "rouge1_precision": 0.11288061517490074,
16
+ "dataset_path": "GEM/web_nlg",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.00386994874457578
20
+ },
21
+ {
22
+ "task_name": "GEM/web_nlg_en",
23
+ "prompt_name": "PALM_prompt",
24
+ "rouge1_recall": 0.27968139569340955,
25
+ "dataset_path": "GEM/web_nlg",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.004818798730073509
29
+ },
30
+ {
31
+ "task_name": "GEM/web_nlg_en",
32
+ "prompt_name": "PALM_prompt",
33
+ "rouge1_fmeasure": 0.13316786961188862,
34
+ "dataset_path": "GEM/web_nlg",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.0032035615287408364
38
+ },
39
+ {
40
+ "task_name": "GEM/web_nlg_en",
41
+ "prompt_name": "PALM_prompt",
42
+ "rouge2_precision": 0.052903771126418274,
43
+ "dataset_path": "GEM/web_nlg",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.0024560083140397187
47
+ },
48
+ {
49
+ "task_name": "GEM/web_nlg_en",
50
+ "prompt_name": "PALM_prompt",
51
+ "rouge2_recall": 0.13670929059101208,
52
+ "dataset_path": "GEM/web_nlg",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.0032752111527625427
56
+ },
57
+ {
58
+ "task_name": "GEM/web_nlg_en",
59
+ "prompt_name": "PALM_prompt",
60
+ "rouge2_fmeasure": 0.0635687916821055,
61
+ "dataset_path": "GEM/web_nlg",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.0020746952988998694
65
+ },
66
+ {
67
+ "task_name": "GEM/web_nlg_en",
68
+ "prompt_name": "PALM_prompt",
69
+ "rougeL_precision": 0.10294903334837356,
70
+ "dataset_path": "GEM/web_nlg",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.0034785078085129637
74
+ },
75
+ {
76
+ "task_name": "GEM/web_nlg_en",
77
+ "prompt_name": "PALM_prompt",
78
+ "rougeL_recall": 0.26443193955147987,
79
+ "dataset_path": "GEM/web_nlg",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.004508721603501755
83
+ },
84
+ {
85
+ "task_name": "GEM/web_nlg_en",
86
+ "prompt_name": "PALM_prompt",
87
+ "rougeL_fmeasure": 0.12246686808714284,
88
+ "dataset_path": "GEM/web_nlg",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.0027865982212275067
92
+ },
93
+ {
94
+ "task_name": "GEM/web_nlg_en",
95
+ "prompt_name": "PALM_prompt",
96
+ "rougeLsum_precision": 0.1043770278231783,
97
+ "dataset_path": "GEM/web_nlg",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.0035484629358203635
101
+ },
102
+ {
103
+ "task_name": "GEM/web_nlg_en",
104
+ "prompt_name": "PALM_prompt",
105
+ "rougeLsum_recall": 0.2648398336113232,
106
+ "dataset_path": "GEM/web_nlg",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.004503000799632382
110
+ },
111
+ {
112
+ "task_name": "GEM/web_nlg_en",
113
+ "prompt_name": "PALM_prompt",
114
+ "rougeLsum_fmeasure": 0.12355055054182587,
115
+ "dataset_path": "GEM/web_nlg",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.002841014595626855
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 1,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/web_nlg_en",
5
+ "prompt_name": "PALM_prompt",
6
+ "bleu": 0.4687852352128821,
7
+ "dataset_path": "GEM/web_nlg",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "bleu_stderr": 0.037809750600987985
11
+ },
12
+ {
13
+ "task_name": "GEM/web_nlg_en",
14
+ "prompt_name": "PALM_prompt",
15
+ "rouge1_precision": 0.12939788800380767,
16
+ "dataset_path": "GEM/web_nlg",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.004056238000462917
20
+ },
21
+ {
22
+ "task_name": "GEM/web_nlg_en",
23
+ "prompt_name": "PALM_prompt",
24
+ "rouge1_recall": 0.3205721940394599,
25
+ "dataset_path": "GEM/web_nlg",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.004784451352989951
29
+ },
30
+ {
31
+ "task_name": "GEM/web_nlg_en",
32
+ "prompt_name": "PALM_prompt",
33
+ "rouge1_fmeasure": 0.15485025651871712,
34
+ "dataset_path": "GEM/web_nlg",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.0034302178802564512
38
+ },
39
+ {
40
+ "task_name": "GEM/web_nlg_en",
41
+ "prompt_name": "PALM_prompt",
42
+ "rouge2_precision": 0.06272684866048131,
43
+ "dataset_path": "GEM/web_nlg",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.002451752863709562
47
+ },
48
+ {
49
+ "task_name": "GEM/web_nlg_en",
50
+ "prompt_name": "PALM_prompt",
51
+ "rouge2_recall": 0.16040449198937512,
52
+ "dataset_path": "GEM/web_nlg",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.003451151403396264
56
+ },
57
+ {
58
+ "task_name": "GEM/web_nlg_en",
59
+ "prompt_name": "PALM_prompt",
60
+ "rouge2_fmeasure": 0.07545945954698206,
61
+ "dataset_path": "GEM/web_nlg",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.0021937365311268525
65
+ },
66
+ {
67
+ "task_name": "GEM/web_nlg_en",
68
+ "prompt_name": "PALM_prompt",
69
+ "rougeL_precision": 0.11538953087200472,
70
+ "dataset_path": "GEM/web_nlg",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.00346589668028597
74
+ },
75
+ {
76
+ "task_name": "GEM/web_nlg_en",
77
+ "prompt_name": "PALM_prompt",
78
+ "rougeL_recall": 0.30175228398138576,
79
+ "dataset_path": "GEM/web_nlg",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.004432821529779081
83
+ },
84
+ {
85
+ "task_name": "GEM/web_nlg_en",
86
+ "prompt_name": "PALM_prompt",
87
+ "rougeL_fmeasure": 0.14062442945039813,
88
+ "dataset_path": "GEM/web_nlg",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.0028845693981807787
92
+ },
93
+ {
94
+ "task_name": "GEM/web_nlg_en",
95
+ "prompt_name": "PALM_prompt",
96
+ "rougeLsum_precision": 0.11767324801231895,
97
+ "dataset_path": "GEM/web_nlg",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.0035688465184641432
101
+ },
102
+ {
103
+ "task_name": "GEM/web_nlg_en",
104
+ "prompt_name": "PALM_prompt",
105
+ "rougeLsum_recall": 0.3033452610052125,
106
+ "dataset_path": "GEM/web_nlg",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.004447592173686305
110
+ },
111
+ {
112
+ "task_name": "GEM/web_nlg_en",
113
+ "prompt_name": "PALM_prompt",
114
+ "rougeLsum_fmeasure": 0.14250354572951518,
115
+ "dataset_path": "GEM/web_nlg",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.002959772587239683
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 2,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/web_nlg_en",
5
+ "prompt_name": "PALM_prompt",
6
+ "bleu": 0.5387522714694768,
7
+ "dataset_path": "GEM/web_nlg",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "bleu_stderr": 0.023700066593108533
11
+ },
12
+ {
13
+ "task_name": "GEM/web_nlg_en",
14
+ "prompt_name": "PALM_prompt",
15
+ "rouge1_precision": 0.14028377513050538,
16
+ "dataset_path": "GEM/web_nlg",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.004341036446200867
20
+ },
21
+ {
22
+ "task_name": "GEM/web_nlg_en",
23
+ "prompt_name": "PALM_prompt",
24
+ "rouge1_recall": 0.33808480225227105,
25
+ "dataset_path": "GEM/web_nlg",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.004840897586860907
29
+ },
30
+ {
31
+ "task_name": "GEM/web_nlg_en",
32
+ "prompt_name": "PALM_prompt",
33
+ "rouge1_fmeasure": 0.1660401631118107,
34
+ "dataset_path": "GEM/web_nlg",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.0037496181550606754
38
+ },
39
+ {
40
+ "task_name": "GEM/web_nlg_en",
41
+ "prompt_name": "PALM_prompt",
42
+ "rouge2_precision": 0.07249658632299814,
43
+ "dataset_path": "GEM/web_nlg",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.0029199126351436076
47
+ },
48
+ {
49
+ "task_name": "GEM/web_nlg_en",
50
+ "prompt_name": "PALM_prompt",
51
+ "rouge2_recall": 0.1738659001719329,
52
+ "dataset_path": "GEM/web_nlg",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.003647562909324833
56
+ },
57
+ {
58
+ "task_name": "GEM/web_nlg_en",
59
+ "prompt_name": "PALM_prompt",
60
+ "rouge2_fmeasure": 0.08448456167004659,
61
+ "dataset_path": "GEM/web_nlg",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.002619780755266116
65
+ },
66
+ {
67
+ "task_name": "GEM/web_nlg_en",
68
+ "prompt_name": "PALM_prompt",
69
+ "rougeL_precision": 0.12529450659786737,
70
+ "dataset_path": "GEM/web_nlg",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.003740919506421422
74
+ },
75
+ {
76
+ "task_name": "GEM/web_nlg_en",
77
+ "prompt_name": "PALM_prompt",
78
+ "rougeL_recall": 0.3182618579312013,
79
+ "dataset_path": "GEM/web_nlg",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.004525730253384199
83
+ },
84
+ {
85
+ "task_name": "GEM/web_nlg_en",
86
+ "prompt_name": "PALM_prompt",
87
+ "rougeL_fmeasure": 0.15093871662748365,
88
+ "dataset_path": "GEM/web_nlg",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.0032225789949202046
92
+ },
93
+ {
94
+ "task_name": "GEM/web_nlg_en",
95
+ "prompt_name": "PALM_prompt",
96
+ "rougeLsum_precision": 0.12866365606890057,
97
+ "dataset_path": "GEM/web_nlg",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.003897701131855631
101
+ },
102
+ {
103
+ "task_name": "GEM/web_nlg_en",
104
+ "prompt_name": "PALM_prompt",
105
+ "rougeLsum_recall": 0.3209266755398485,
106
+ "dataset_path": "GEM/web_nlg",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.004550196873941636
110
+ },
111
+ {
112
+ "task_name": "GEM/web_nlg_en",
113
+ "prompt_name": "PALM_prompt",
114
+ "rougeLsum_fmeasure": 0.15379137375695537,
115
+ "dataset_path": "GEM/web_nlg",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.0033296076273151513
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 3,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/web_nlg_en",
5
+ "prompt_name": "PALM_prompt",
6
+ "bleu": 0.6154728892401495,
7
+ "dataset_path": "GEM/web_nlg",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "bleu_stderr": 0.05492052153457257
11
+ },
12
+ {
13
+ "task_name": "GEM/web_nlg_en",
14
+ "prompt_name": "PALM_prompt",
15
+ "rouge1_precision": 0.1416216780254008,
16
+ "dataset_path": "GEM/web_nlg",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.004403560474088744
20
+ },
21
+ {
22
+ "task_name": "GEM/web_nlg_en",
23
+ "prompt_name": "PALM_prompt",
24
+ "rouge1_recall": 0.350369181874358,
25
+ "dataset_path": "GEM/web_nlg",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.004821260815960307
29
+ },
30
+ {
31
+ "task_name": "GEM/web_nlg_en",
32
+ "prompt_name": "PALM_prompt",
33
+ "rouge1_fmeasure": 0.16918194985742865,
34
+ "dataset_path": "GEM/web_nlg",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.003726050287797417
38
+ },
39
+ {
40
+ "task_name": "GEM/web_nlg_en",
41
+ "prompt_name": "PALM_prompt",
42
+ "rouge2_precision": 0.07377427634131609,
43
+ "dataset_path": "GEM/web_nlg",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.0029769913238928973
47
+ },
48
+ {
49
+ "task_name": "GEM/web_nlg_en",
50
+ "prompt_name": "PALM_prompt",
51
+ "rouge2_recall": 0.18193245830782417,
52
+ "dataset_path": "GEM/web_nlg",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.003657633632618109
56
+ },
57
+ {
58
+ "task_name": "GEM/web_nlg_en",
59
+ "prompt_name": "PALM_prompt",
60
+ "rouge2_fmeasure": 0.08639654396535042,
61
+ "dataset_path": "GEM/web_nlg",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.0025466510150414677
65
+ },
66
+ {
67
+ "task_name": "GEM/web_nlg_en",
68
+ "prompt_name": "PALM_prompt",
69
+ "rougeL_precision": 0.12691040102952247,
70
+ "dataset_path": "GEM/web_nlg",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.0038030608678551996
74
+ },
75
+ {
76
+ "task_name": "GEM/web_nlg_en",
77
+ "prompt_name": "PALM_prompt",
78
+ "rougeL_recall": 0.33016396642156515,
79
+ "dataset_path": "GEM/web_nlg",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.004483672398798779
83
+ },
84
+ {
85
+ "task_name": "GEM/web_nlg_en",
86
+ "prompt_name": "PALM_prompt",
87
+ "rougeL_fmeasure": 0.15419230913013102,
88
+ "dataset_path": "GEM/web_nlg",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.0031966205747524113
92
+ },
93
+ {
94
+ "task_name": "GEM/web_nlg_en",
95
+ "prompt_name": "PALM_prompt",
96
+ "rougeLsum_precision": 0.1301329876623986,
97
+ "dataset_path": "GEM/web_nlg",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.003962229410505262
101
+ },
102
+ {
103
+ "task_name": "GEM/web_nlg_en",
104
+ "prompt_name": "PALM_prompt",
105
+ "rougeLsum_recall": 0.3326898264482343,
106
+ "dataset_path": "GEM/web_nlg",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.004514627367420683
110
+ },
111
+ {
112
+ "task_name": "GEM/web_nlg_en",
113
+ "prompt_name": "PALM_prompt",
114
+ "rougeLsum_fmeasure": 0.15686161901707832,
115
+ "dataset_path": "GEM/web_nlg",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.003303387308503824
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 4,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/web_nlg_en",
5
+ "prompt_name": "PALM_prompt",
6
+ "bleu": 0.6280080728696126,
7
+ "dataset_path": "GEM/web_nlg",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "bleu_stderr": 0.04207890973166607
11
+ },
12
+ {
13
+ "task_name": "GEM/web_nlg_en",
14
+ "prompt_name": "PALM_prompt",
15
+ "rouge1_precision": 0.15509132478375567,
16
+ "dataset_path": "GEM/web_nlg",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.004705843597864089
20
+ },
21
+ {
22
+ "task_name": "GEM/web_nlg_en",
23
+ "prompt_name": "PALM_prompt",
24
+ "rouge1_recall": 0.36072408074578627,
25
+ "dataset_path": "GEM/web_nlg",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.004932090632111422
29
+ },
30
+ {
31
+ "task_name": "GEM/web_nlg_en",
32
+ "prompt_name": "PALM_prompt",
33
+ "rouge1_fmeasure": 0.1785781279253249,
34
+ "dataset_path": "GEM/web_nlg",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.003864561091208021
38
+ },
39
+ {
40
+ "task_name": "GEM/web_nlg_en",
41
+ "prompt_name": "PALM_prompt",
42
+ "rouge2_precision": 0.08180998073756966,
43
+ "dataset_path": "GEM/web_nlg",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.0031797028106942695
47
+ },
48
+ {
49
+ "task_name": "GEM/web_nlg_en",
50
+ "prompt_name": "PALM_prompt",
51
+ "rouge2_recall": 0.1895120629191651,
52
+ "dataset_path": "GEM/web_nlg",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.0037580265385154963
56
+ },
57
+ {
58
+ "task_name": "GEM/web_nlg_en",
59
+ "prompt_name": "PALM_prompt",
60
+ "rouge2_fmeasure": 0.09234282351104058,
61
+ "dataset_path": "GEM/web_nlg",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.0026995310439716595
65
+ },
66
+ {
67
+ "task_name": "GEM/web_nlg_en",
68
+ "prompt_name": "PALM_prompt",
69
+ "rougeL_precision": 0.13813720543380648,
70
+ "dataset_path": "GEM/web_nlg",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.004053616600136467
74
+ },
75
+ {
76
+ "task_name": "GEM/web_nlg_en",
77
+ "prompt_name": "PALM_prompt",
78
+ "rougeL_recall": 0.33922496409575775,
79
+ "dataset_path": "GEM/web_nlg",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.004622505715509403
83
+ },
84
+ {
85
+ "task_name": "GEM/web_nlg_en",
86
+ "prompt_name": "PALM_prompt",
87
+ "rougeL_fmeasure": 0.1620344936907299,
88
+ "dataset_path": "GEM/web_nlg",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.003324512248892316
92
+ },
93
+ {
94
+ "task_name": "GEM/web_nlg_en",
95
+ "prompt_name": "PALM_prompt",
96
+ "rougeLsum_precision": 0.14168764741861828,
97
+ "dataset_path": "GEM/web_nlg",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.004239364453934911
101
+ },
102
+ {
103
+ "task_name": "GEM/web_nlg_en",
104
+ "prompt_name": "PALM_prompt",
105
+ "rougeLsum_recall": 0.34078189394441194,
106
+ "dataset_path": "GEM/web_nlg",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.004625313010789047
110
+ },
111
+ {
112
+ "task_name": "GEM/web_nlg_en",
113
+ "prompt_name": "PALM_prompt",
114
+ "rougeLsum_fmeasure": 0.16446961325308998,
115
+ "dataset_path": "GEM/web_nlg",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.0034245269422962608
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 5,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.14831832571904127,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.0020031574591708096
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.23669379639579727,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.002619263402445374
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.16843326604027012,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.0018787988149176356
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.027495279137204204,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0008008675778483577
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.04558631579219217,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0013034624565288761
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.03124497623150672,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.000826359540629872
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.11944058601706234,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.0014613554843883904
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.197662087047554,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0022098401272736727
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.1373698337101501,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0014061340796650912
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.13662513471708013,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.001834224913870075
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.21848967035350617,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.00242500324850602
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.15530559329563356,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0017277092690194619
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 1.4416513225274035,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.07150581722616807
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 0,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.2474989333475308,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.004192775409890903
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.16589702996724112,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.002477684211999076
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.1595930665634592,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.0020260063889951897
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.06456439217534583,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0026036519415425876
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.03395563608028862,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0011867665512589468
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.03442090774378983,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0011197085684525982
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.2007100384592662,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.0036402244436710527
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.12968060330920403,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0019100623389590725
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.12543184498335516,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0015786460435038327
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.2356687196157363,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.004040097148447038
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.15692818247747958,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.0023149233432171795
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.15113313197846198,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0019006022698345565
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 1.9112541973999004,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.08163120654641273
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 1,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.3671552578777415,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.00464310641179873
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.18316424412100443,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.002534818661881543
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.20013093911819188,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.0021449192871676984
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.11652986149569292,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0033165278944766326
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.048557963351484004,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0012897480876968325
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.05444986552431719,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0012835369876590637
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.3030100733810117,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.004160446544180668
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.1454519391726235,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0020102539956042787
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.1600577177279773,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0017194821563251488
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.34948494261958846,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.00452044298551405
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.17241597783925564,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.0023681037127358745
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.18899973850835908,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0020281161914398288
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 2.4920293719291493,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.06820740373159184
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 2,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.31853023306817924,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.005101746577027814
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.14722792301810253,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.0026467115286204013
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.16433308908585867,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.002402134739956357
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.10260882729326853,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0033488070271773295
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.04037648707007591,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0012615029724490608
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.04583954485462417,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0012897030660356447
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.26690611784240603,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.004530048771512807
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.11874183155575303,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0021251987841602225
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.13364146877613045,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.00195553742839865
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.3046492013868888,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.004949401564936648
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.1393491071946238,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.0024976587731928646
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.15587618164528091,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.002273493914941302
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 1.6055821821151304,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.06565825928935355
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 3,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.10914047578759525,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.004168163234199949
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.04875414041264775,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.0020018963702338544
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.05455257649733116,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.002012404693346845
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.037261877113724175,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.002437336913811512
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.013448412424643126,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0008658300088105147
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.01577378906241384,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0009385589100462621
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.09325777757588789,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.003680574837942309
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.03991105511182912,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0016279952050045603
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.045100032510421204,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0016666399875103265
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.10390247551362326,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.004006559051889402
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.04597031630948619,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.0018861168846922784
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.051422475813845084,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0018931617381586926
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 0.006932560692974423,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.0017990239865701577
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 4,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.019024182319545703,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.0019567530452185085
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.00809080049652358,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.0008930277662354756
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.009250948371284507,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.0009365636508956353
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.007878162042298584,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0012119853940782482
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.0024981499124714754,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.00036249889299460995
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.0031062476201456015,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.00043823448085888057
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.016781942931569222,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.0017964368358031513
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.006703878314586593,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0007330971808896851
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.007784924334651645,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0007922903839285351
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.01855296243342967,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.001923182050974051
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.00783323994370169,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.0008689584836341853
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.008943947340808445,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0009064353361518755
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 1.5486400435190646e-20,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 5.928974165985541e-18
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 5,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.06536803730199292,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0040080446277792475
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.0562578647719299,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.003466710519052521
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.05638135159346031,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.0033862796262482922
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.013453387419618108,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.0014082914388032289
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.012484556808642255,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.0012859869239756115
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.01218356666574454,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.0012325430366165824
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.049695294130091966,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.0031067522139507655
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.04191724945748147,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.0026229390757436582
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.042197213021247744,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.0025684294584781297
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.0501407378152416,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.00312422462775062
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.04244935232602974,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.0026544789695008624
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.042656324273577836,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.002590374442922286
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 0.114133025618453,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 0.03902166340342278
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 4,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.0,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.0,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.0
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.0,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.0
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.0,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.0
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.0,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.0
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.0,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.0
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.0,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.0
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.0,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.0
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.0,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.0
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.0,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.0
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.0,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.0
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.0,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.0
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 0.0,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 0.0
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 5,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
evaluation/rankeval/lm1-2b8-55b-oscarroots_0.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.322,0.014782913600996673,0
+ anli_r2,acc,0.331,0.014888272588203936,0
+ anli_r3,acc,0.3358333333333333,0.013639261190932887,0
+ arc_challenge,acc,0.2380546075085324,0.012445770028026208,0
+ arc_challenge,acc_norm,0.2627986348122867,0.012862523175351335,0
+ arc_easy,acc,0.5513468013468014,0.010205540414612862,0
+ arc_easy,acc_norm,0.49326599326599324,0.010258852980991825,0
+ boolq,acc,0.5850152905198777,0.008617716361921567,1
+ cb,acc,0.35714285714285715,0.0646095738380922,1
+ cb,f1,0.24888576120103215,,1
+ copa,acc,0.68,0.046882617226215034,0
+ hellaswag,acc,0.38169687313284206,0.004848099661619686,0
+ hellaswag,acc_norm,0.47689703246365267,0.004984452002563925,0
+ piqa,acc,0.7121871599564744,0.01056325038305919,0
+ piqa,acc_norm,0.7094668117519043,0.010592765034696534,0
+ rte,acc,0.5270758122743683,0.030052303463143706,0
+ sciq,acc,0.804,0.012559527926707378,0
+ sciq,acc_norm,0.722,0.014174516461485247,0
+ storycloze_2016,acc,0.6654195617316943,0.01091131896712794,0
+ winogrande,acc,0.5138121546961326,0.014047122916440415,0
evaluation/rankeval/lm1-2b8-55b-oscarroots_0.json CHANGED
@@ -20,6 +20,52 @@
  "copa": {
  "acc": 0.68,
  "acc_stderr": 0.046882617226215034
+ },
+ "hellaswag": {
+ "acc": 0.38169687313284206,
+ "acc_stderr": 0.004848099661619686,
+ "acc_norm": 0.47689703246365267,
+ "acc_norm_stderr": 0.004984452002563925
+ },
+ "rte": {
+ "acc": 0.5270758122743683,
+ "acc_stderr": 0.030052303463143706
+ },
+ "winogrande": {
+ "acc": 0.5138121546961326,
+ "acc_stderr": 0.014047122916440415
+ },
+ "storycloze_2016": {
+ "acc": 0.6654195617316943,
+ "acc_stderr": 0.01091131896712794
+ },
+ "boolq": {
+ "acc": 0.5850152905198777,
+ "acc_stderr": 0.008617716361921567
+ },
+ "arc_easy": {
+ "acc": 0.5513468013468014,
+ "acc_stderr": 0.010205540414612862,
+ "acc_norm": 0.49326599326599324,
+ "acc_norm_stderr": 0.010258852980991825
+ },
+ "arc_challenge": {
+ "acc": 0.2380546075085324,
+ "acc_stderr": 0.012445770028026208,
+ "acc_norm": 0.2627986348122867,
+ "acc_norm_stderr": 0.012862523175351335
+ },
+ "sciq": {
+ "acc": 0.804,
+ "acc_stderr": 0.012559527926707378,
+ "acc_norm": 0.722,
+ "acc_norm_stderr": 0.014174516461485247
+ },
+ "piqa": {
+ "acc": 0.7121871599564744,
+ "acc_stderr": 0.01056325038305919,
+ "acc_norm": 0.7094668117519043,
+ "acc_norm_stderr": 0.010592765034696534
  }
  },
  "versions": {
@@ -27,6 +73,15 @@
  "anli_r2": 0,
  "anli_r3": 0,
  "cb": 1,
- "copa": 0
+ "copa": 0,
+ "hellaswag": 0,
+ "rte": 0,
+ "winogrande": 0,
+ "storycloze_2016": 0,
+ "boolq": 1,
+ "arc_easy": 0,
+ "arc_challenge": 0,
+ "sciq": 0,
+ "piqa": 0
  }
  }
evaluation/rankeval/lm1-2b8-55b-oscarroots_0_lm-eval_global_step52452_2023-02-25-11-16-27_0shots_backup.json DELETED
@@ -1,32 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.322,
5
- "acc_stderr": 0.014782913600996673
6
- },
7
- "anli_r2": {
8
- "acc": 0.331,
9
- "acc_stderr": 0.014888272588203936
10
- },
11
- "anli_r3": {
12
- "acc": 0.3358333333333333,
13
- "acc_stderr": 0.013639261190932887
14
- },
15
- "cb": {
16
- "acc": 0.35714285714285715,
17
- "acc_stderr": 0.0646095738380922,
18
- "f1": 0.24888576120103215
19
- },
20
- "copa": {
21
- "acc": 0.68,
22
- "acc_stderr": 0.046882617226215034
23
- }
24
- },
25
- "versions": {
26
- "anli_r1": 0,
27
- "anli_r2": 0,
28
- "anli_r3": 0,
29
- "cb": 1,
30
- "copa": 0
31
- }
32
- }
 
evaluation/rankeval/lm1-2b8-55b-oscarroots_1.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.303,0.014539683710535255,0
+ anli_r2,acc,0.319,0.014746404865473486,0
+ anli_r3,acc,0.33416666666666667,0.013622434813136781,0
+ arc_challenge,acc,0.2431740614334471,0.012536554144587092,0
+ arc_challenge,acc_norm,0.28071672354948807,0.013131238126975583,0
+ arc_easy,acc,0.5631313131313131,0.010177672928157681,0
+ arc_easy,acc_norm,0.5273569023569024,0.010244415164390529,0
+ boolq,acc,0.5813455657492355,0.008628545022868554,1
+ cb,acc,0.48214285714285715,0.06737697508644648,1
+ cb,f1,0.3270348837209302,,1
+ copa,acc,0.66,0.04760952285695237,0
+ hellaswag,acc,0.3815972913762199,0.004847857546957478,0
+ hellaswag,acc_norm,0.477096195976897,0.004984543540932335,0
+ piqa,acc,0.7078346028291621,0.010610252174513658,0
+ piqa,acc_norm,0.6996735582154516,0.010695225308183145,0
+ rte,acc,0.5306859205776173,0.03003973059219781,0
+ sciq,acc,0.871,0.010605256784796565,0
+ sciq,acc_norm,0.861,0.010945263761042963,0
+ storycloze_2016,acc,0.655264564404062,0.01099083028205749,0
+ winogrande,acc,0.5414364640883977,0.0140041468537919,0
evaluation/rankeval/lm1-2b8-55b-oscarroots_1.json CHANGED
@@ -20,6 +20,52 @@
  "copa": {
  "acc": 0.66,
  "acc_stderr": 0.04760952285695237
+ },
+ "hellaswag": {
+ "acc": 0.3815972913762199,
+ "acc_stderr": 0.004847857546957478,
+ "acc_norm": 0.477096195976897,
+ "acc_norm_stderr": 0.004984543540932335
+ },
+ "rte": {
+ "acc": 0.5306859205776173,
+ "acc_stderr": 0.03003973059219781
+ },
+ "winogrande": {
+ "acc": 0.5414364640883977,
+ "acc_stderr": 0.0140041468537919
+ },
+ "storycloze_2016": {
+ "acc": 0.655264564404062,
+ "acc_stderr": 0.01099083028205749
+ },
+ "boolq": {
+ "acc": 0.5813455657492355,
+ "acc_stderr": 0.008628545022868554
+ },
+ "arc_easy": {
+ "acc": 0.5631313131313131,
+ "acc_stderr": 0.010177672928157681,
+ "acc_norm": 0.5273569023569024,
+ "acc_norm_stderr": 0.010244415164390529
+ },
+ "arc_challenge": {
+ "acc": 0.2431740614334471,
+ "acc_stderr": 0.012536554144587092,
+ "acc_norm": 0.28071672354948807,
+ "acc_norm_stderr": 0.013131238126975583
+ },
+ "sciq": {
+ "acc": 0.871,
+ "acc_stderr": 0.010605256784796565,
+ "acc_norm": 0.861,
+ "acc_norm_stderr": 0.010945263761042963
+ },
+ "piqa": {
+ "acc": 0.7078346028291621,
+ "acc_stderr": 0.010610252174513658,
+ "acc_norm": 0.6996735582154516,
+ "acc_norm_stderr": 0.010695225308183145
  }
  },
  "versions": {
@@ -27,6 +73,15 @@
  "anli_r2": 0,
  "anli_r3": 0,
  "cb": 1,
- "copa": 0
+ "copa": 0,
+ "hellaswag": 0,
+ "rte": 0,
+ "winogrande": 0,
+ "storycloze_2016": 0,
+ "boolq": 1,
+ "arc_easy": 0,
+ "arc_challenge": 0,
+ "sciq": 0,
+ "piqa": 0
  }
  }
evaluation/rankeval/lm1-2b8-55b-oscarroots_1_lm-eval_global_step52452_2023-02-25-11-18-29_1shots_backup.json DELETED
@@ -1,32 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.303,
5
- "acc_stderr": 0.014539683710535255
6
- },
7
- "anli_r2": {
8
- "acc": 0.319,
9
- "acc_stderr": 0.014746404865473486
10
- },
11
- "anli_r3": {
12
- "acc": 0.33416666666666667,
13
- "acc_stderr": 0.013622434813136781
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.06737697508644648,
18
- "f1": 0.3270348837209302
19
- },
20
- "copa": {
21
- "acc": 0.66,
22
- "acc_stderr": 0.04760952285695237
23
- }
24
- },
25
- "versions": {
26
- "anli_r1": 0,
27
- "anli_r2": 0,
28
- "anli_r3": 0,
29
- "cb": 1,
30
- "copa": 0
31
- }
32
- }
evaluation/rankeval/lm1-2b8-55b-oscarroots_2.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.311,0.014645596385722694,0
+ anli_r2,acc,0.346,0.01505026612756444,0
+ anli_r3,acc,0.325,0.013526454480351014,0
+ arc_challenge,acc,0.24573378839590443,0.012581033453730114,0
+ arc_challenge,acc_norm,0.27986348122866894,0.013119040897725922,0
+ arc_easy,acc,0.5744949494949495,0.010145271182591021,0
+ arc_easy,acc_norm,0.5471380471380471,0.010214087372211392,0
+ boolq,acc,0.5629969418960244,0.008675365793227084,1
+ cb,acc,0.4107142857142857,0.0663363415035954,1
+ cb,f1,0.26927814732692784,,1
+ copa,acc,0.68,0.04688261722621505,0
+ hellaswag,acc,0.38149770961959767,0.00484761521647344,0
+ hellaswag,acc_norm,0.4757020513841864,0.004983886091690525,0
+ piqa,acc,0.7154515778019587,0.010527218464130605,0
+ piqa,acc_norm,0.7105549510337323,0.01058101474067561,0
+ rte,acc,0.5379061371841155,0.03000984891252912,0
+ sciq,acc,0.885,0.010093407594904628,0
+ sciq,acc_norm,0.88,0.010281328012747384,0
+ storycloze_2016,acc,0.6547300908605024,0.010994860223187675,0
+ winogrande,acc,0.5272296764009471,0.014031631629827696,0
evaluation/rankeval/lm1-2b8-55b-oscarroots_2.json CHANGED
@@ -20,6 +20,52 @@
  "copa": {
  "acc": 0.68,
  "acc_stderr": 0.04688261722621505
+ },
+ "hellaswag": {
+ "acc": 0.38149770961959767,
+ "acc_stderr": 0.00484761521647344,
+ "acc_norm": 0.4757020513841864,
+ "acc_norm_stderr": 0.004983886091690525
+ },
+ "rte": {
+ "acc": 0.5379061371841155,
+ "acc_stderr": 0.03000984891252912
+ },
+ "winogrande": {
+ "acc": 0.5272296764009471,
+ "acc_stderr": 0.014031631629827696
+ },
+ "storycloze_2016": {
+ "acc": 0.6547300908605024,
+ "acc_stderr": 0.010994860223187675
+ },
+ "boolq": {
+ "acc": 0.5629969418960244,
+ "acc_stderr": 0.008675365793227084
+ },
+ "arc_easy": {
+ "acc": 0.5744949494949495,
+ "acc_stderr": 0.010145271182591021,
+ "acc_norm": 0.5471380471380471,
+ "acc_norm_stderr": 0.010214087372211392
+ },
+ "arc_challenge": {
+ "acc": 0.24573378839590443,
+ "acc_stderr": 0.012581033453730114,
+ "acc_norm": 0.27986348122866894,
+ "acc_norm_stderr": 0.013119040897725922
+ },
+ "sciq": {
+ "acc": 0.885,
+ "acc_stderr": 0.010093407594904628,
+ "acc_norm": 0.88,
+ "acc_norm_stderr": 0.010281328012747384
+ },
+ "piqa": {
+ "acc": 0.7154515778019587,
+ "acc_stderr": 0.010527218464130605,
+ "acc_norm": 0.7105549510337323,
+ "acc_norm_stderr": 0.01058101474067561
  }
  },
  "versions": {
@@ -27,6 +73,15 @@
  "anli_r2": 0,
  "anli_r3": 0,
  "cb": 1,
- "copa": 0
+ "copa": 0,
+ "hellaswag": 0,
+ "rte": 0,
+ "winogrande": 0,
+ "storycloze_2016": 0,
+ "boolq": 1,
+ "arc_easy": 0,
+ "arc_challenge": 0,
+ "sciq": 0,
+ "piqa": 0
  }
  }