Muennighoff committed on
Commit 29393c3
1 Parent(s): cc04ae0
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +58 -0
  2. 4b284b12bc4seed1/evaluation/generation/merged.csv +53 -0
  3. 4b284b12bc4seed1/evaluation/generation/merged.json +1 -0
  4. 4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_0.csv +21 -0
  5. 4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_0_lm-eval_global_step80108_2023-02-25-09-56-03_0shots_backup.json +0 -87
  6. 4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_1.csv +21 -0
  7. 4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_1_lm-eval_global_step80108_2023-02-25-09-56-03_1shots_backup.json +0 -87
  8. 4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_2.csv +21 -0
  9. 4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_2_lm-eval_global_step80108_2023-02-25-09-56-03_2shots_backup.json +0 -87
  10. 4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_3.csv +21 -0
  11. 4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_3_lm-eval_global_step80108_2023-02-25-09-54-24_3shots_backup.json +0 -87
  12. 4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_4.csv +21 -0
  13. 4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_4_lm-eval_global_step80108_2023-02-25-09-56-03_4shots_backup.json +0 -87
  14. 4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_5.csv +21 -0
  15. 4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_5_lm-eval_global_step80108_2023-02-25-09-56-03_5shots_backup.json +0 -87
  16. 4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.json +1 -0
  17. 4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.json +1 -0
  18. 4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.json +1 -0
  19. 4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.json +1 -0
  20. 4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.json +1 -0
  21. 4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.json +1 -0
  22. 4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.json +1 -0
  23. 4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.jsonl +3 -0
  24. 4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.jsonl +3 -0
  25. 4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.jsonl +3 -0
  26. 4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl +3 -0
  27. 4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl +3 -0
  28. 4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.jsonl +3 -0
  29. 4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.jsonl +3 -0
  30. 4b284b12bc4seed2/evaluation/generation/merged.csv +53 -0
  31. 4b284b12bc4seed2/evaluation/generation/merged.json +1 -0
  32. 4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.json +133 -0
  33. 4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.json +133 -0
  34. 4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.json +133 -0
  35. 4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.json +133 -0
  36. 4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.json +133 -0
  37. 4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.json +133 -0
  38. 4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.json +133 -0
  39. 4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_0.csv +21 -0
  40. 4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_0_lm-eval_global_step80108_2023-02-24-15-37-27_0shots_backup.json +0 -87
  41. 4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_1.csv +21 -0
  42. 4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_1_lm-eval_global_step80108_2023-02-24-15-37-27_1shots_backup.json +0 -87
  43. 4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_2.csv +21 -0
  44. 4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_2_lm-eval_global_step80108_2023-02-24-15-37-27_2shots_backup.json +0 -87
  45. 4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_3.csv +21 -0
  46. 4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_3_lm-eval_global_step80108_2023-02-24-15-37-27_3shots_backup.json +0 -87
  47. 4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_4.csv +21 -0
  48. 4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_4_lm-eval_global_step80108_2023-02-24-15-37-27_4shots_backup.json +0 -87
  49. 4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_5.csv +21 -0
  50. 4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_5_lm-eval_global_step80108_2023-02-24-15-37-27_5shots_backup.json +0 -87
.gitattributes CHANGED
@@ -540,3 +540,61 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
540
  4b284b12bc4seed1/evaluation/generation/examples.4b284b12bc4seed1_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
541
  4b284b12bc4seed1/evaluation/generation/examples.4b284b12bc4seed1_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
542
  4b284b12bc4seed1/evaluation/generation/examples.4b284b12bc4seed1_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
543
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
544
+ 4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
545
+ 4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
546
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
547
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text
548
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text
549
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
550
+ 4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
551
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
552
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
553
+ 4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
554
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
555
+ 4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
556
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text
557
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
558
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
559
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
560
+ 4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
561
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
562
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
563
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text
564
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
565
+ 4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
566
+ 4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
567
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
568
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
569
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
570
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
571
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
572
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
573
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text
574
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
575
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
576
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text
577
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
578
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
579
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
580
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
581
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
582
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
583
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text
584
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
585
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
586
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
587
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
588
+ 4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
589
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
590
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
591
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
592
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
593
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text
594
+ 4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
595
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
596
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
597
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
598
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text
599
+ 4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text
600
+ 4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
4b284b12bc4seed1/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0046873222992475675
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0046873222992475675
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.10965942344261605
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.10965942344261605
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.13810992950827863
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.13810992950827863
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.15830106057751542
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.15830106057751542
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.1643064184413658
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.1643064184413658
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.1672452517241617
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.1672452517241617
14
+ e2e_nlg_cleaned,5,average,multiple,0.12371823433219753
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.00907913529407834
16
+ gem_xsum,0,median,rouge2_fmeasure,0.00907913529407834
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.009694349099296177
18
+ gem_xsum,1,median,rouge2_fmeasure,0.009694349099296177
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.013095799671134395
20
+ gem_xsum,2,median,rouge2_fmeasure,0.013095799671134395
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.014219210333358922
22
+ gem_xsum,3,median,rouge2_fmeasure,0.014219210333358922
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.005260596389372223
24
+ gem_xsum,4,median,rouge2_fmeasure,0.005260596389372223
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,4.1835752834372257e-05
26
+ gem_xsum,5,median,rouge2_fmeasure,4.1835752834372257e-05
27
+ gem_xsum,5,average,multiple,0.008565154423345739
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05709876588901533
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.05709876588901533
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.056707072109866384
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.056707072109866384
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.057190355206892265
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.057190355206892265
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05729780671036665
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.05729780671036665
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05717656346160751
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.05717656346160751
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.056226522227932986
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.056226522227932986
40
+ web_nlg_en,5,average,multiple,0.05694951426761352
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.004231279339262943
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.004231279339262943
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.012970684860542986
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.012970684860542986
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.018558312186827172
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.018558312186827172
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.022809767017555965
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.022809767017555965
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.009630515215905031
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.009630515215905031
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0015852585648270666
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0015852585648270666
53
+ wiki_lingua_en,5,average,multiple,0.011630969530820195
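A minimal sketch (not part of this commit, assumes pandas is installed) of how the merged.csv added above can be read; the column layout is taken from its header row (dataset,fewshots,prompt,metric,value). Rows whose prompt field is "median" hold the per-shot median rouge2_fmeasure across prompts (here identical to the single prompt's value, since each task has one prompt), and the single "average" row per task matches the mean of those six medians (for example 0.123718... for e2e_nlg_cleaned).

import pandas as pd

# Path as it appears in this commit.
df = pd.read_csv("4b284b12bc4seed1/evaluation/generation/merged.csv")

# Per-task, per-shot median rouge2_fmeasure (rows tagged prompt == "median").
medians = df[df["prompt"] == "median"].pivot(
    index="dataset", columns="fewshots", values="value"
)

# One value per task: the mean over the 0-5 shot medians (rows tagged "average").
averages = df[df["prompt"] == "average"].set_index("dataset")["value"]

print(medians)
print(averages)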
4b284b12bc4seed1/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4137289209321652, "bleu_stderr": 0.03592840698985395, "rouge1_fmeasure": 0.12202957876438714, "rouge1_fmeasure_stderr": 0.0020897173297206215, "rouge1_precision": 0.08052703538584219, "rouge1_precision_stderr": 0.0017032922927436723, "rouge1_recall": 0.34685731924417923, "rouge1_recall_stderr": 0.004696819146262294, "rouge2_fmeasure": 0.05709876588901533, "rouge2_fmeasure_stderr": 0.001338487535015282, "rouge2_precision": 0.0371963393448065, "rouge2_precision_stderr": 0.001000406080573449, "rouge2_recall": 0.16920772861048264, "rouge2_recall_stderr": 0.003352502998815072, "rougeL_fmeasure": 0.11638689693482958, "rougeL_fmeasure_stderr": 0.0018866066835855856, "rougeL_precision": 0.07642927515383495, "rougeL_precision_stderr": 0.0015283993241970184, "rougeL_recall": 0.33488912829275347, "rougeL_recall_stderr": 0.0045599664755635915, "rougeLsum_fmeasure": 0.11524865510935987, "rougeLsum_fmeasure_stderr": 0.001924801119419314, "rougeLsum_precision": 0.07604904600865044, "rougeLsum_precision_stderr": 0.0015809038296551435, "rougeLsum_recall": 0.32815010771859937, "rougeLsum_recall_stderr": 0.004354254520219142}}, "1": {"PALM_prompt": {"bleu": 0.44320235453851886, "bleu_stderr": 0.038654580038729125, "rouge1_fmeasure": 0.12119128966268897, "rouge1_fmeasure_stderr": 0.0020394652462786458, "rouge1_precision": 0.0788292625128184, "rouge1_precision_stderr": 0.001545865630724369, "rouge1_recall": 0.34898005560018197, "rouge1_recall_stderr": 0.004794912547747137, "rouge2_fmeasure": 0.056707072109866384, "rouge2_fmeasure_stderr": 0.001298880402066703, "rouge2_precision": 0.03674083737130746, "rouge2_precision_stderr": 0.0009509963850229752, "rouge2_recall": 0.17151623711827377, "rouge2_recall_stderr": 0.0034511631482392565, "rougeL_fmeasure": 0.1158600041349514, "rougeL_fmeasure_stderr": 0.0018669882568564582, "rougeL_precision": 0.07498774748969433, "rougeL_precision_stderr": 0.0013853305220861518, "rougeL_recall": 0.33741306095617796, "rougeL_recall_stderr": 0.004672233537657657, "rougeLsum_fmeasure": 0.11473200282386622, "rougeLsum_fmeasure_stderr": 0.0019120368338142416, "rougeLsum_precision": 0.07464447143195571, "rougeLsum_precision_stderr": 0.001449910621324083, "rougeLsum_recall": 0.3300443408931697, "rougeLsum_recall_stderr": 0.0044396729039993124}}, "2": {"PALM_prompt": {"bleu": 0.4318507139656118, "bleu_stderr": 0.020126652103215823, "rouge1_fmeasure": 0.1211597172909803, "rouge1_fmeasure_stderr": 0.002024707920166121, "rouge1_precision": 0.07846937247932836, "rouge1_precision_stderr": 0.0015076119882704358, "rouge1_recall": 0.3512872090522125, "rouge1_recall_stderr": 0.004843993030116799, "rouge2_fmeasure": 0.057190355206892265, "rouge2_fmeasure_stderr": 0.0012758800698632082, "rouge2_precision": 0.03689332195406738, "rouge2_precision_stderr": 0.000921539572592384, "rouge2_recall": 0.17407998451855428, "rouge2_recall_stderr": 0.003436093615166077, "rougeL_fmeasure": 0.11605588796630185, "rougeL_fmeasure_stderr": 0.0018703477264729452, "rougeL_precision": 0.07486170965273004, "rougeL_precision_stderr": 0.0013700867024842227, "rougeL_recall": 0.3400276434993588, "rougeL_recall_stderr": 0.004727704994934382, "rougeLsum_fmeasure": 0.1146567492990496, "rougeLsum_fmeasure_stderr": 0.0018940503662138908, "rougeLsum_precision": 0.07425974001821288, "rougeLsum_precision_stderr": 0.001410836040797904, "rougeLsum_recall": 0.33236868938870073, "rougeLsum_recall_stderr": 0.004471409789333774}}, "3": {"PALM_prompt": {"bleu": 0.437820931259575, 
"bleu_stderr": 0.032980906480770865, "rouge1_fmeasure": 0.12105811695552406, "rouge1_fmeasure_stderr": 0.002038435380001679, "rouge1_precision": 0.0785926975391468, "rouge1_precision_stderr": 0.0015293603423737205, "rouge1_recall": 0.3493939393936786, "rouge1_recall_stderr": 0.004787694752084967, "rouge2_fmeasure": 0.05729780671036665, "rouge2_fmeasure_stderr": 0.0012917575350816215, "rouge2_precision": 0.03703345126698836, "rouge2_precision_stderr": 0.0009394498824059976, "rouge2_recall": 0.17399497492711738, "rouge2_recall_stderr": 0.0034376201679732726, "rougeL_fmeasure": 0.11550661899372622, "rougeL_fmeasure_stderr": 0.001880977056199583, "rougeL_precision": 0.07468535679699133, "rougeL_precision_stderr": 0.0013893206714726519, "rougeL_recall": 0.33714313164357695, "rougeL_recall_stderr": 0.004665286047497972, "rougeLsum_fmeasure": 0.11457534646291556, "rougeLsum_fmeasure_stderr": 0.0019112644015919146, "rougeLsum_precision": 0.07441958215060634, "rougeLsum_precision_stderr": 0.0014367820161710976, "rougeLsum_recall": 0.3308931008083881, "rougeLsum_recall_stderr": 0.004444824580452303}}, "4": {"PALM_prompt": {"bleu": 0.44162298156671387, "bleu_stderr": 0.03391466255022238, "rouge1_fmeasure": 0.12061836300623924, "rouge1_fmeasure_stderr": 0.0019968483074851587, "rouge1_precision": 0.07809208129692338, "rouge1_precision_stderr": 0.0014897473758506577, "rouge1_recall": 0.3509704340756107, "rouge1_recall_stderr": 0.004753376988143335, "rouge2_fmeasure": 0.05717656346160751, "rouge2_fmeasure_stderr": 0.001268286668919329, "rouge2_precision": 0.03681990692658017, "rouge2_precision_stderr": 0.000915025627561909, "rouge2_recall": 0.17569168523985457, "rouge2_recall_stderr": 0.003469639579874031, "rougeL_fmeasure": 0.11522040390086466, "rougeL_fmeasure_stderr": 0.0018477157862829916, "rougeL_precision": 0.07430240518062578, "rougeL_precision_stderr": 0.001356662158276597, "rougeL_recall": 0.33848067082117866, "rougeL_recall_stderr": 0.004630272763143682, "rougeLsum_fmeasure": 0.11384094914867832, "rougeLsum_fmeasure_stderr": 0.0018639383740272559, "rougeLsum_precision": 0.0737119127298611, "rougeLsum_precision_stderr": 0.0013930977884493516, "rougeLsum_recall": 0.33180763273757674, "rougeLsum_recall_stderr": 0.0044219813999489994}}, "5": {"PALM_prompt": {"bleu": 0.43584026292753897, "bleu_stderr": 0.03585233394325135, "rouge1_fmeasure": 0.11885306502412998, "rouge1_fmeasure_stderr": 0.0020096665427829846, "rouge1_precision": 0.0771055828854556, "rouge1_precision_stderr": 0.001542950246478603, "rouge1_recall": 0.3485317236937699, "rouge1_recall_stderr": 0.004899363014116322, "rouge2_fmeasure": 0.056226522227932986, "rouge2_fmeasure_stderr": 0.0012937028286243123, "rouge2_precision": 0.03635257403197826, "rouge2_precision_stderr": 0.0009726184426252793, "rouge2_recall": 0.17491992097726822, "rouge2_recall_stderr": 0.003638732362333741, "rougeL_fmeasure": 0.11305367539046324, "rougeL_fmeasure_stderr": 0.0018447783930060325, "rougeL_precision": 0.07302888437056451, "rougeL_precision_stderr": 0.0013873752417405871, "rougeL_recall": 0.3349645978185685, "rougeL_recall_stderr": 0.004754512380009764, "rougeLsum_fmeasure": 0.11195050334054779, "rougeLsum_fmeasure_stderr": 0.001866391249607523, "rougeLsum_precision": 0.07261259826097624, "rougeLsum_precision_stderr": 0.001427137555756464, "rougeLsum_recall": 0.3288098925316197, "rougeLsum_recall_stderr": 0.004531291754719013}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 0.049986883599927444, "bleu_stderr": 0.007715281553787714, "rouge1_fmeasure": 
0.09138645256779954, "rouge1_fmeasure_stderr": 0.001016337895874407, "rouge1_precision": 0.082701062500984, "rouge1_precision_stderr": 0.0011114690924738965, "rouge1_recall": 0.12060845181501477, "rouge1_recall_stderr": 0.001294419304833145, "rouge2_fmeasure": 0.004231279339262943, "rouge2_fmeasure_stderr": 0.00021788523328651946, "rouge2_precision": 0.004221824477192578, "rouge2_precision_stderr": 0.00022695165818508478, "rouge2_recall": 0.005013826064972669, "rouge2_recall_stderr": 0.0002886749356995342, "rougeL_fmeasure": 0.08450530803702978, "rougeL_fmeasure_stderr": 0.000901554006313133, "rougeL_precision": 0.07584986609826198, "rougeL_precision_stderr": 0.0009625240871993999, "rougeL_recall": 0.11285337234897329, "rougeL_recall_stderr": 0.0012181187217990192, "rougeLsum_fmeasure": 0.08182685699093503, "rougeLsum_fmeasure_stderr": 0.0008824530614132399, "rougeLsum_precision": 0.07385235192113547, "rougeLsum_precision_stderr": 0.0009724143584799478, "rougeLsum_recall": 0.10890947711749421, "rougeLsum_recall_stderr": 0.001166359192980158}}, "1": {"tldr_en": {"bleu": 0.6846935681911303, "bleu_stderr": 0.048520060976790166, "rouge1_fmeasure": 0.1243815766776328, "rouge1_fmeasure_stderr": 0.0015555535464807611, "rouge1_precision": 0.10817533987287839, "rouge1_precision_stderr": 0.001557158831062884, "rouge1_recall": 0.17611373045605921, "rouge1_recall_stderr": 0.0022577581661268067, "rouge2_fmeasure": 0.012970684860542986, "rouge2_fmeasure_stderr": 0.000559978352567754, "rouge2_precision": 0.011131441628753095, "rouge2_precision_stderr": 0.0004968420013755639, "rouge2_recall": 0.01944327988697502, "rouge2_recall_stderr": 0.0009397177797597168, "rougeL_fmeasure": 0.10222393170091604, "rougeL_fmeasure_stderr": 0.001072599197742085, "rougeL_precision": 0.08784703908570092, "rougeL_precision_stderr": 0.0010678467138718758, "rougeL_recall": 0.14780695365208887, "rougeL_recall_stderr": 0.0017330218043806387, "rougeLsum_fmeasure": 0.11629483720068202, "rougeLsum_fmeasure_stderr": 0.0014424904565344696, "rougeLsum_precision": 0.10103160064031973, "rougeLsum_precision_stderr": 0.0014463815800196036, "rougeLsum_recall": 0.16516729505216035, "rougeLsum_recall_stderr": 0.0021165463983442577}}, "2": {"tldr_en": {"bleu": 0.924456967620158, "bleu_stderr": 0.03559680289475058, "rouge1_fmeasure": 0.13555617621093394, "rouge1_fmeasure_stderr": 0.0016918864606359741, "rouge1_precision": 0.11722745375161758, "rouge1_precision_stderr": 0.0016710911856635707, "rouge1_recall": 0.19369245188326673, "rouge1_recall_stderr": 0.0024549618061264467, "rouge2_fmeasure": 0.018558312186827172, "rouge2_fmeasure_stderr": 0.0006667508176160621, "rouge2_precision": 0.015714560769775038, "rouge2_precision_stderr": 0.0005800745400016176, "rouge2_recall": 0.028401953794142447, "rouge2_recall_stderr": 0.0011788887815516438, "rougeL_fmeasure": 0.11035639324221315, "rougeL_fmeasure_stderr": 0.00116330079987278, "rougeL_precision": 0.09440630375692156, "rougeL_precision_stderr": 0.0011423074011562776, "rougeL_recall": 0.16112301920239752, "rougeL_recall_stderr": 0.0019113881872497018, "rougeLsum_fmeasure": 0.12597857991599187, "rougeLsum_fmeasure_stderr": 0.0015636832895239018, "rougeLsum_precision": 0.10875530149178837, "rougeLsum_precision_stderr": 0.001540487913238783, "rougeLsum_recall": 0.18075066689057118, "rougeLsum_recall_stderr": 0.002307538960280241}}, "3": {"tldr_en": {"bleu": 1.3233423451404336, "bleu_stderr": 0.05997841498906398, "rouge1_fmeasure": 0.13144771127478205, "rouge1_fmeasure_stderr": 0.0019737365577617576, 
"rouge1_precision": 0.11737973030479486, "rouge1_precision_stderr": 0.0020068624954179474, "rouge1_recall": 0.190625289714194, "rouge1_recall_stderr": 0.0029906981571119883, "rouge2_fmeasure": 0.022809767017555965, "rouge2_fmeasure_stderr": 0.0007411221039709219, "rouge2_precision": 0.019855590629909, "rouge2_precision_stderr": 0.0006805588907016732, "rouge2_recall": 0.035136977735741294, "rouge2_recall_stderr": 0.0012979131621517655, "rougeL_fmeasure": 0.10364321400933231, "rougeL_fmeasure_stderr": 0.0014077752246067334, "rougeL_precision": 0.09211895227457069, "rougeL_precision_stderr": 0.001463335123058735, "rougeL_recall": 0.1532140325253042, "rougeL_recall_stderr": 0.0023289864471671166, "rougeLsum_fmeasure": 0.12201579783859283, "rougeLsum_fmeasure_stderr": 0.0018234148242418614, "rougeLsum_precision": 0.10903674502952361, "rougeLsum_precision_stderr": 0.0018618317785407703, "rougeLsum_recall": 0.17719145160862762, "rougeLsum_recall_stderr": 0.002790477962341549}}, "4": {"tldr_en": {"bleu": 0.3613016066707259, "bleu_stderr": 0.036915701681708164, "rouge1_fmeasure": 0.047822320448517586, "rouge1_fmeasure_stderr": 0.0017110762026540103, "rouge1_precision": 0.04521455263428701, "rouge1_precision_stderr": 0.001817051799470601, "rouge1_recall": 0.07167001920005753, "rouge1_recall_stderr": 0.0026297817119641954, "rouge2_fmeasure": 0.009630515215905031, "rouge2_fmeasure_stderr": 0.0005703572936436381, "rouge2_precision": 0.008537546793712486, "rouge2_precision_stderr": 0.0005451212697125958, "rouge2_recall": 0.015535161953539139, "rouge2_recall_stderr": 0.0010082854476503452, "rougeL_fmeasure": 0.037591797438144656, "rougeL_fmeasure_stderr": 0.0012882375080276282, "rougeL_precision": 0.03581845161408539, "rougeL_precision_stderr": 0.001454348764904071, "rougeL_recall": 0.05736959242350815, "rougeL_recall_stderr": 0.0020694778242817827, "rougeLsum_fmeasure": 0.04440192076877557, "rougeLsum_fmeasure_stderr": 0.0015922060897516483, "rougeLsum_precision": 0.04226563254674496, "rougeLsum_precision_stderr": 0.001724425679642684, "rougeLsum_recall": 0.0664662102163325, "rougeLsum_recall_stderr": 0.0024427933119544}}, "5": {"tldr_en": {"bleu": 3.525168786175522e-07, "bleu_stderr": 5.612162101464965e-07, "rouge1_fmeasure": 0.00773346326426295, "rouge1_fmeasure_stderr": 0.0007772828239978309, "rouge1_precision": 0.007307470265602752, "rouge1_precision_stderr": 0.0007897900183615145, "rouge1_recall": 0.011510813177556005, "rouge1_recall_stderr": 0.0011552914746690388, "rouge2_fmeasure": 0.0015852585648270666, "rouge2_fmeasure_stderr": 0.00022806743110909383, "rouge2_precision": 0.001429085231861359, "rouge2_precision_stderr": 0.00021883906441152553, "rouge2_recall": 0.002570423991576332, "rouge2_recall_stderr": 0.00046790313160452587, "rougeL_fmeasure": 0.005728100957946406, "rougeL_fmeasure_stderr": 0.0005516850094907344, "rougeL_precision": 0.005352988837771441, "rougeL_precision_stderr": 0.0005537181118288242, "rougeL_recall": 0.00878014362052647, "rougeL_recall_stderr": 0.0008837201326346826, "rougeLsum_fmeasure": 0.007108428092699879, "rougeLsum_fmeasure_stderr": 0.0007115175993630998, "rougeLsum_precision": 0.006777901778057594, "rougeLsum_precision_stderr": 0.0007374290911612345, "rougeLsum_recall": 0.010579350524900687, "rougeLsum_recall_stderr": 0.0010634321336190457}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.043151087658762945, "bleu_stderr": 0.0032342340375698313, "rouge1_fmeasure": 0.08500629167730897, "rouge1_fmeasure_stderr": 0.0006989135591713463, 
"rouge1_precision": 0.06495149327905961, "rouge1_precision_stderr": 0.0006154921660754954, "rouge1_recall": 0.13370167404751163, "rouge1_recall_stderr": 0.0010189486890049794, "rouge2_fmeasure": 0.0046873222992475675, "rouge2_fmeasure_stderr": 0.0002022412956316276, "rouge2_precision": 0.0035303832289245198, "rouge2_precision_stderr": 0.0001527456230237433, "rouge2_recall": 0.007319549034551619, "rouge2_recall_stderr": 0.00032488894186680385, "rougeL_fmeasure": 0.08337201239477712, "rougeL_fmeasure_stderr": 0.000651472451774746, "rougeL_precision": 0.06359934650707177, "rougeL_precision_stderr": 0.0005723430187186213, "rougeL_recall": 0.13157140403081988, "rougeL_recall_stderr": 0.0009812226045655013, "rougeLsum_fmeasure": 0.07193972974334535, "rougeLsum_fmeasure_stderr": 0.0005686183231544299, "rougeLsum_precision": 0.05504912687523559, "rougeLsum_precision_stderr": 0.0005196674711591116, "rougeLsum_recall": 0.11355464831908403, "rougeLsum_recall_stderr": 0.0008466893831039181}}, "1": {"generate_text_restaurant": {"bleu": 5.257466769424374, "bleu_stderr": 0.07357547724601604, "rouge1_fmeasure": 0.28148883433290234, "rouge1_fmeasure_stderr": 0.001923209524922163, "rouge1_precision": 0.23786938588769446, "rouge1_precision_stderr": 0.002104681875527956, "rouge1_recall": 0.4121530298560502, "rouge1_recall_stderr": 0.0030089320928001376, "rouge2_fmeasure": 0.10965942344261605, "rouge2_fmeasure_stderr": 0.001266910542618815, "rouge2_precision": 0.09021145181730728, "rouge2_precision_stderr": 0.0012485251931366231, "rouge2_recall": 0.16603227142774246, "rouge2_recall_stderr": 0.002026618225842661, "rougeL_fmeasure": 0.23105487418644294, "rougeL_fmeasure_stderr": 0.0014408012821073104, "rougeL_precision": 0.193582677648821, "rougeL_precision_stderr": 0.001580931677078503, "rougeL_recall": 0.34417212574068223, "rougeL_recall_stderr": 0.0025977918681195948, "rougeLsum_fmeasure": 0.22905909984905182, "rougeLsum_fmeasure_stderr": 0.0017314569916895598, "rougeLsum_precision": 0.19491877857913123, "rougeLsum_precision_stderr": 0.001917333394119044, "rougeLsum_recall": 0.3349539368345205, "rougeLsum_recall_stderr": 0.0027000371649433153}}, "2": {"generate_text_restaurant": {"bleu": 6.5539390861720035, "bleu_stderr": 0.11033075795685604, "rouge1_fmeasure": 0.32505406392907477, "rouge1_fmeasure_stderr": 0.0019673064297673175, "rouge1_precision": 0.2903404594422581, "rouge1_precision_stderr": 0.0024584696959761548, "rouge1_recall": 0.4333135279478913, "rouge1_recall_stderr": 0.002735582379776765, "rouge2_fmeasure": 0.13810992950827863, "rouge2_fmeasure_stderr": 0.001444581219830018, "rouge2_precision": 0.12271979460763294, "rouge2_precision_stderr": 0.0015962080243060197, "rouge2_recall": 0.1876592718803383, "rouge2_recall_stderr": 0.002021534025159006, "rougeL_fmeasure": 0.2534760790754178, "rougeL_fmeasure_stderr": 0.0014777484527935105, "rougeL_precision": 0.22365724489731642, "rougeL_precision_stderr": 0.0018056307126048264, "rougeL_recall": 0.345401861303691, "rougeL_recall_stderr": 0.0024446936487181117, "rougeLsum_fmeasure": 0.2673001651581832, "rougeLsum_fmeasure_stderr": 0.0018598623220801944, "rougeLsum_precision": 0.2395654073443738, "rougeLsum_precision_stderr": 0.00223173839964408, "rougeLsum_recall": 0.35566140454139644, "rougeLsum_recall_stderr": 0.0025565410886410314}}, "3": {"generate_text_restaurant": {"bleu": 7.802031385100445, "bleu_stderr": 0.12520230313336564, "rouge1_fmeasure": 0.3574815420785405, "rouge1_fmeasure_stderr": 0.0020170776348172677, "rouge1_precision": 
0.3338557482843276, "rouge1_precision_stderr": 0.002551971432391476, "rouge1_recall": 0.438621126600984, "rouge1_recall_stderr": 0.0026752980677359345, "rouge2_fmeasure": 0.15830106057751542, "rouge2_fmeasure_stderr": 0.001543718964396758, "rouge2_precision": 0.14777645169229991, "rouge2_precision_stderr": 0.001697278469314969, "rouge2_recall": 0.1968122332214139, "rouge2_recall_stderr": 0.0020328947895924005, "rougeL_fmeasure": 0.26474191187216406, "rougeL_fmeasure_stderr": 0.0015211355756810816, "rougeL_precision": 0.24519327081461934, "rougeL_precision_stderr": 0.0018824234987225013, "rougeL_recall": 0.33120052143267326, "rougeL_recall_stderr": 0.002370647416558713, "rougeLsum_fmeasure": 0.29524890844717644, "rougeLsum_fmeasure_stderr": 0.0019147519696928641, "rougeLsum_precision": 0.2764593808821877, "rougeLsum_precision_stderr": 0.002325889733406939, "rougeLsum_recall": 0.36160015573781135, "rougeLsum_recall_stderr": 0.00249846394269229}}, "4": {"generate_text_restaurant": {"bleu": 8.382191502210562, "bleu_stderr": 0.14919541969359718, "rouge1_fmeasure": 0.37167375204035885, "rouge1_fmeasure_stderr": 0.001958486588378226, "rouge1_precision": 0.35487550391649403, "rouge1_precision_stderr": 0.0025344302739034657, "rouge1_recall": 0.4361438966782426, "rouge1_recall_stderr": 0.002544026410687206, "rouge2_fmeasure": 0.1643064184413658, "rouge2_fmeasure_stderr": 0.0015388346414606316, "rouge2_precision": 0.15717469519815827, "rouge2_precision_stderr": 0.0017315413364928619, "rouge2_recall": 0.19553331645690575, "rouge2_recall_stderr": 0.001973839383704276, "rougeL_fmeasure": 0.269902657132463, "rougeL_fmeasure_stderr": 0.001559683698964203, "rougeL_precision": 0.2558302815927228, "rougeL_precision_stderr": 0.0019085151038414955, "rougeL_recall": 0.3217813559039779, "rougeL_recall_stderr": 0.002287943298156401, "rougeLsum_fmeasure": 0.3079430786135033, "rougeLsum_fmeasure_stderr": 0.0018787126708384577, "rougeLsum_precision": 0.29398198190868147, "rougeLsum_precision_stderr": 0.0022912359575795035, "rougeLsum_recall": 0.3619119446078494, "rougeLsum_recall_stderr": 0.0024413151060046338}}, "5": {"generate_text_restaurant": {"bleu": 8.434096593422941, "bleu_stderr": 0.12374952402155545, "rouge1_fmeasure": 0.3784865425574256, "rouge1_fmeasure_stderr": 0.0019280406579030697, "rouge1_precision": 0.3663947022200451, "rouge1_precision_stderr": 0.0025940276035361334, "rouge1_recall": 0.4374444751473449, "rouge1_recall_stderr": 0.002477034708943807, "rouge2_fmeasure": 0.1672452517241617, "rouge2_fmeasure_stderr": 0.0015293774465050592, "rouge2_precision": 0.1621778484306982, "rouge2_precision_stderr": 0.0017550698361701628, "rouge2_recall": 0.19557321714564072, "rouge2_recall_stderr": 0.001918438736692879, "rougeL_fmeasure": 0.27416374665856746, "rougeL_fmeasure_stderr": 0.0015606439952248043, "rougeL_precision": 0.2631916594457156, "rougeL_precision_stderr": 0.0019344484995491348, "rougeL_recall": 0.32194602376614945, "rougeL_recall_stderr": 0.0022606742323408495, "rougeLsum_fmeasure": 0.3146193040515442, "rougeLsum_fmeasure_stderr": 0.0018454927016279012, "rougeLsum_precision": 0.30450332346607173, "rougeLsum_precision_stderr": 0.002324281011297584, "rougeLsum_recall": 0.36375181614652935, "rougeLsum_recall_stderr": 0.002343908164411574}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 0.2362860989119941, "bleu_stderr": 0.06515733012952077, "rouge1_fmeasure": 0.11289013583650749, "rouge1_fmeasure_stderr": 0.0015241960402614591, "rouge1_precision": 0.08342489348341924, 
"rouge1_precision_stderr": 0.0012842607244636889, "rouge1_recall": 0.18313462458030272, "rouge1_recall_stderr": 0.0022833897865371204, "rouge2_fmeasure": 0.00907913529407834, "rouge2_fmeasure_stderr": 0.0005360706812311225, "rouge2_precision": 0.006807640108939664, "rouge2_precision_stderr": 0.0004546616697105076, "rouge2_recall": 0.014646684932925907, "rouge2_recall_stderr": 0.0008582893632378247, "rougeL_fmeasure": 0.09604887262918436, "rougeL_fmeasure_stderr": 0.0012450372296769945, "rougeL_precision": 0.07075720554200345, "rougeL_precision_stderr": 0.0010438807547277764, "rougeL_recall": 0.1570230345629149, "rougeL_recall_stderr": 0.001966731475785525, "rougeLsum_fmeasure": 0.09369805182267531, "rougeLsum_fmeasure_stderr": 0.0012217813043505867, "rougeLsum_precision": 0.06903402748086229, "rougeLsum_precision_stderr": 0.0010262159629070258, "rougeLsum_recall": 0.15315001526257727, "rougeLsum_recall_stderr": 0.00192242995222989}}, "1": {"article_DOC_summary": {"bleu": 0.37834186143792814, "bleu_stderr": 0.07579632209521951, "rouge1_fmeasure": 0.09612718872859236, "rouge1_fmeasure_stderr": 0.0016940816359247253, "rouge1_precision": 0.0685738400230712, "rouge1_precision_stderr": 0.0012614794489907162, "rouge1_recall": 0.16764066671392627, "rouge1_recall_stderr": 0.0028556162454937625, "rouge2_fmeasure": 0.009694349099296177, "rouge2_fmeasure_stderr": 0.0007117075020582719, "rouge2_precision": 0.00687822694283814, "rouge2_precision_stderr": 0.0005061346255521965, "rouge2_recall": 0.017217988643461746, "rouge2_recall_stderr": 0.001286522520281308, "rougeL_fmeasure": 0.08667789904325363, "rougeL_fmeasure_stderr": 0.0014098828990377955, "rougeL_precision": 0.06179089290304659, "rougeL_precision_stderr": 0.001050543101974293, "rougeL_recall": 0.15154479594716758, "rougeL_recall_stderr": 0.0024146601749356305, "rougeLsum_fmeasure": 0.08238177966767227, "rougeLsum_fmeasure_stderr": 0.0013689020999623559, "rougeLsum_precision": 0.058639097301684553, "rougeLsum_precision_stderr": 0.0010116604269926539, "rougeLsum_recall": 0.14457668621942818, "rougeLsum_recall_stderr": 0.0023919173520257917}}, "2": {"article_DOC_summary": {"bleu": 0.4849143382987017, "bleu_stderr": 0.04905432423972044, "rouge1_fmeasure": 0.10437600774672044, "rouge1_fmeasure_stderr": 0.00207952100569553, "rouge1_precision": 0.07431434923410046, "rouge1_precision_stderr": 0.0015283612791276602, "rouge1_recall": 0.1827918011387138, "rouge1_recall_stderr": 0.003586036642211407, "rouge2_fmeasure": 0.013095799671134395, "rouge2_fmeasure_stderr": 0.0008489058427507967, "rouge2_precision": 0.009246404787084099, "rouge2_precision_stderr": 0.0006013153673346344, "rouge2_recall": 0.02349062031177244, "rouge2_recall_stderr": 0.001548636822631295, "rougeL_fmeasure": 0.09226827959916255, "rougeL_fmeasure_stderr": 0.0016450494129720853, "rougeL_precision": 0.06564873824136315, "rougeL_precision_stderr": 0.0012078804697379189, "rougeL_recall": 0.16191451646036112, "rougeL_recall_stderr": 0.002884321850290769, "rougeLsum_fmeasure": 0.08811439184894636, "rougeLsum_fmeasure_stderr": 0.001657196039152884, "rougeLsum_precision": 0.06260025314838306, "rougeLsum_precision_stderr": 0.0012121969148993826, "rougeLsum_recall": 0.15522785504407458, "rougeLsum_recall_stderr": 0.002921032689214908}}, "3": {"article_DOC_summary": {"bleu": 0.5746056304031278, "bleu_stderr": 0.03416800265114781, "rouge1_fmeasure": 0.10584203948827893, "rouge1_fmeasure_stderr": 0.0022277495525711575, "rouge1_precision": 0.07801541338302878, "rouge1_precision_stderr": 
0.0018173640538940942, "rouge1_recall": 0.18154402515522824, "rouge1_recall_stderr": 0.003824785983248113, "rouge2_fmeasure": 0.014219210333358922, "rouge2_fmeasure_stderr": 0.0009311822269523914, "rouge2_precision": 0.01034600818070244, "rouge2_precision_stderr": 0.0007057473078572547, "rouge2_recall": 0.024986150502265235, "rouge2_recall_stderr": 0.001692210317983424, "rougeL_fmeasure": 0.09228122264940566, "rougeL_fmeasure_stderr": 0.001790178384835284, "rougeL_precision": 0.0678536492661622, "rougeL_precision_stderr": 0.0014739423841839488, "rougeL_recall": 0.1585838156145662, "rougeL_recall_stderr": 0.0030943555602493257, "rougeLsum_fmeasure": 0.08945125770061474, "rougeLsum_fmeasure_stderr": 0.0018164527031048951, "rougeLsum_precision": 0.06570229340369058, "rougeLsum_precision_stderr": 0.0014813914622807897, "rougeLsum_recall": 0.1543154445584897, "rougeLsum_recall_stderr": 0.0031794372645773186}}, "4": {"article_DOC_summary": {"bleu": 0.4651237596229011, "bleu_stderr": 0.11156128095981639, "rouge1_fmeasure": 0.03399511479025987, "rouge1_fmeasure_stderr": 0.0020729464765277573, "rouge1_precision": 0.03064279607658244, "rouge1_precision_stderr": 0.0022965807355317283, "rouge1_recall": 0.05195082603025642, "rouge1_recall_stderr": 0.003220539005393162, "rouge2_fmeasure": 0.005260596389372223, "rouge2_fmeasure_stderr": 0.0006214146366789691, "rouge2_precision": 0.004284009012383848, "rouge2_precision_stderr": 0.0005485208167059648, "rouge2_recall": 0.008382344530163326, "rouge2_recall_stderr": 0.001000201966734194, "rougeL_fmeasure": 0.028356576842958853, "rougeL_fmeasure_stderr": 0.0016783975883159549, "rougeL_precision": 0.025526009580683906, "rougeL_precision_stderr": 0.0018894407281977873, "rougeL_recall": 0.04350947253111104, "rougeL_recall_stderr": 0.002619339324653331, "rougeLsum_fmeasure": 0.02809770605845126, "rougeLsum_fmeasure_stderr": 0.0016857742426147392, "rougeLsum_precision": 0.025412397431441656, "rougeLsum_precision_stderr": 0.0019050756306301015, "rougeLsum_recall": 0.043137551559700685, "rougeLsum_recall_stderr": 0.0026330951282367504}}, "5": {"article_DOC_summary": {"bleu": 1.0502876017100983e-40, "bleu_stderr": 9.258629322298703e-36, "rouge1_fmeasure": 0.001878489668702617, "rouge1_fmeasure_stderr": 0.0005292148091180053, "rouge1_precision": 0.0021504887939511347, "rouge1_precision_stderr": 0.0006247191105487891, "rouge1_recall": 0.001777814575528098, "rouge1_recall_stderr": 0.0005039303650046605, "rouge2_fmeasure": 4.1835752834372257e-05, "rouge2_fmeasure_stderr": 4.183575283437121e-05, "rouge2_precision": 5.717552887364208e-05, "rouge2_precision_stderr": 5.7175528873642526e-05, "rouge2_recall": 3.298588204248582e-05, "rouge2_recall_stderr": 3.298588204248464e-05, "rougeL_fmeasure": 0.0016730266584955929, "rougeL_fmeasure_stderr": 0.00046229330330986443, "rougeL_precision": 0.0018915873488676697, "rougeL_precision_stderr": 0.0005354926007169597, "rougeL_recall": 0.0016022233245543466, "rougeL_recall_stderr": 0.00045106425429012806, "rougeLsum_fmeasure": 0.0016122575588416806, "rougeLsum_fmeasure_stderr": 0.00045030733547815735, "rougeLsum_precision": 0.0018136207185854307, "rougeLsum_precision_stderr": 0.0005184093493039095, "rougeLsum_recall": 0.0015526690795628231, "rougeLsum_recall_stderr": 0.0004429788793808031}}}}
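The merged.json above nests the same numbers as task, then few-shot count (a string key "0" to "5"), then prompt name, then a metric dictionary. A minimal sketch, assuming only the structure visible in the file; the rouge2_fmeasure values it prints correspond to the per-prompt rows of merged.csv (the CSV drops the "GEM/" prefix from task names).

import json

with open("4b284b12bc4seed1/evaluation/generation/merged.json") as f:
    results = json.load(f)

for task, by_shots in results.items():
    for shots in sorted(by_shots, key=int):
        for prompt, metrics in by_shots[shots].items():
            print(task, shots, prompt, metrics["rouge2_fmeasure"])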
4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_0.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.334,0.014922019523732967,0
3
+ anli_r2,acc,0.325,0.014818724459095527,0
4
+ anli_r3,acc,0.3441666666666667,0.013720551062295756,0
5
+ arc_challenge,acc,0.26023890784982934,0.012821930225112568,0
6
+ arc_challenge,acc_norm,0.2790102389078498,0.01310678488360133,0
7
+ arc_easy,acc,0.5660774410774411,0.010169795770462111,0
8
+ arc_easy,acc_norm,0.5084175084175084,0.010258329515226459,0
9
+ boolq,acc,0.591131498470948,0.008598573693259106,1
10
+ cb,acc,0.4107142857142857,0.0663363415035954,1
11
+ cb,f1,0.1940928270042194,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.46415056761601275,0.0049769393332400776,0
14
+ hellaswag,acc_norm,0.6052579167496515,0.0048779626449918555,0
15
+ piqa,acc,0.7404787812840044,0.01022793988817392,0
16
+ piqa,acc_norm,0.7431991294885746,0.01019286480227804,0
17
+ rte,acc,0.5270758122743683,0.0300523034631437,0
18
+ sciq,acc,0.829,0.011912216456264607,0
19
+ sciq,acc_norm,0.751,0.013681600278702301,0
20
+ storycloze_2016,acc,0.7151256012827365,0.010437513986611718,0
21
+ winogrande,acc,0.5824782951854776,0.013859978264440251,0
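The rankeval CSVs added in this commit are one file per few-shot setting; the _0 to _5 suffix is the shot count, matching the "0shots" through "5shots" in the deleted backup file names. A minimal sketch (not part of the commit, assumes pandas) that stacks them into a single task-by-shots accuracy table:

import pandas as pd

frames = []
for shots in range(6):
    path = f"4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_{shots}.csv"
    frame = pd.read_csv(path)  # columns: task,metric,value,err,version
    frame["shots"] = shots
    frames.append(frame)

rankeval = pd.concat(frames, ignore_index=True)

# Accuracy per task and few-shot setting (one row per task after the pivot).
acc = rankeval[rankeval["metric"] == "acc"].pivot(
    index="task", columns="shots", values="value"
)
print(acc)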
4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_0_lm-eval_global_step80108_2023-02-25-09-56-03_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.334,
5
- "acc_stderr": 0.014922019523732967
6
- },
7
- "anli_r2": {
8
- "acc": 0.325,
9
- "acc_stderr": 0.014818724459095527
10
- },
11
- "anli_r3": {
12
- "acc": 0.3441666666666667,
13
- "acc_stderr": 0.013720551062295756
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.0663363415035954,
18
- "f1": 0.1940928270042194
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.46415056761601275,
26
- "acc_stderr": 0.0049769393332400776,
27
- "acc_norm": 0.6052579167496515,
28
- "acc_norm_stderr": 0.0048779626449918555
29
- },
30
- "rte": {
31
- "acc": 0.5270758122743683,
32
- "acc_stderr": 0.0300523034631437
33
- },
34
- "winogrande": {
35
- "acc": 0.5824782951854776,
36
- "acc_stderr": 0.013859978264440251
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7151256012827365,
40
- "acc_stderr": 0.010437513986611718
41
- },
42
- "boolq": {
43
- "acc": 0.591131498470948,
44
- "acc_stderr": 0.008598573693259106
45
- },
46
- "arc_easy": {
47
- "acc": 0.5660774410774411,
48
- "acc_stderr": 0.010169795770462111,
49
- "acc_norm": 0.5084175084175084,
50
- "acc_norm_stderr": 0.010258329515226459
51
- },
52
- "arc_challenge": {
53
- "acc": 0.26023890784982934,
54
- "acc_stderr": 0.012821930225112568,
55
- "acc_norm": 0.2790102389078498,
56
- "acc_norm_stderr": 0.01310678488360133
57
- },
58
- "sciq": {
59
- "acc": 0.829,
60
- "acc_stderr": 0.011912216456264607,
61
- "acc_norm": 0.751,
62
- "acc_norm_stderr": 0.013681600278702301
63
- },
64
- "piqa": {
65
- "acc": 0.7404787812840044,
66
- "acc_stderr": 0.01022793988817392,
67
- "acc_norm": 0.7431991294885746,
68
- "acc_norm_stderr": 0.01019286480227804
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
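Each *_backup.json deleted in this commit is an lm-eval results dump (a "results" dict plus a "versions" dict); the CSV that replaces it flattens the same data into task,metric,value,err,version rows, with err taken from the matching "<metric>_stderr" entry and left empty when there is none (for example cb's f1). A minimal sketch of that mapping, not the conversion script actually used; the file names below are placeholders:

import csv
import json

with open("backup.json") as f:  # placeholder: a local copy of one *_backup.json
    data = json.load(f)

rows = []
for task, metrics in data["results"].items():
    for metric, value in metrics.items():
        if metric.endswith("_stderr"):
            continue  # stderr entries become the err column of their base metric
        err = metrics.get(f"{metric}_stderr", "")
        rows.append((task, metric, value, err, data["versions"][task]))

with open("flattened.csv", "w", newline="") as f:  # placeholder output path
    writer = csv.writer(f)
    writer.writerow(["task", "metric", "value", "err", "version"])
    writer.writerows(sorted(rows))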
4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_1.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.339,0.014976758771620342,0
3
+ anli_r2,acc,0.323,0.014794927843348644,0
4
+ anli_r3,acc,0.3441666666666667,0.013720551062295756,0
5
+ arc_challenge,acc,0.2721843003412969,0.013006600406423706,0
6
+ arc_challenge,acc_norm,0.3037542662116041,0.013438909184778764,0
7
+ arc_easy,acc,0.6056397306397306,0.010028176038393004,0
8
+ arc_easy,acc_norm,0.5606060606060606,0.010184134315437663,0
9
+ boolq,acc,0.5773700305810398,0.008639722698719023,1
10
+ cb,acc,0.5,0.06741998624632421,1
11
+ cb,f1,0.3261261261261261,,1
12
+ copa,acc,0.81,0.03942772444036623,0
13
+ hellaswag,acc,0.4643497311292571,0.004977081808179424,0
14
+ hellaswag,acc_norm,0.6074487153953396,0.004873203269366301,0
15
+ piqa,acc,0.7535364526659413,0.010054810789671824,0
16
+ piqa,acc_norm,0.7595212187159956,0.009971345364651068,0
17
+ rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.847,0.01138950045966553,0
19
+ sciq,acc_norm,0.792,0.012841374572096928,0
20
+ storycloze_2016,acc,0.7129877071084981,0.010460934115933261,0
21
+ winogrande,acc,0.5777426992896606,0.013881582030658549,0
4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_1_lm-eval_global_step80108_2023-02-25-09-56-03_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.339,
5
- "acc_stderr": 0.014976758771620342
6
- },
7
- "anli_r2": {
8
- "acc": 0.323,
9
- "acc_stderr": 0.014794927843348644
10
- },
11
- "anli_r3": {
12
- "acc": 0.3441666666666667,
13
- "acc_stderr": 0.013720551062295756
14
- },
15
- "cb": {
16
- "acc": 0.5,
17
- "acc_stderr": 0.06741998624632421,
18
- "f1": 0.3261261261261261
19
- },
20
- "copa": {
21
- "acc": 0.81,
22
- "acc_stderr": 0.03942772444036623
23
- },
24
- "hellaswag": {
25
- "acc": 0.4643497311292571,
26
- "acc_stderr": 0.004977081808179424,
27
- "acc_norm": 0.6074487153953396,
28
- "acc_norm_stderr": 0.004873203269366301
29
- },
30
- "rte": {
31
- "acc": 0.5270758122743683,
32
- "acc_stderr": 0.030052303463143706
33
- },
34
- "winogrande": {
35
- "acc": 0.5777426992896606,
36
- "acc_stderr": 0.013881582030658549
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7129877071084981,
40
- "acc_stderr": 0.010460934115933261
41
- },
42
- "boolq": {
43
- "acc": 0.5773700305810398,
44
- "acc_stderr": 0.008639722698719023
45
- },
46
- "arc_easy": {
47
- "acc": 0.6056397306397306,
48
- "acc_stderr": 0.010028176038393004,
49
- "acc_norm": 0.5606060606060606,
50
- "acc_norm_stderr": 0.010184134315437663
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2721843003412969,
54
- "acc_stderr": 0.013006600406423706,
55
- "acc_norm": 0.3037542662116041,
56
- "acc_norm_stderr": 0.013438909184778764
57
- },
58
- "sciq": {
59
- "acc": 0.847,
60
- "acc_stderr": 0.01138950045966553,
61
- "acc_norm": 0.792,
62
- "acc_norm_stderr": 0.012841374572096928
63
- },
64
- "piqa": {
65
- "acc": 0.7535364526659413,
66
- "acc_stderr": 0.010054810789671824,
67
- "acc_norm": 0.7595212187159956,
68
- "acc_norm_stderr": 0.009971345364651068
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_2.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.326,0.014830507204541037,0
3
+ anli_r2,acc,0.337,0.014955087918653607,0
4
+ anli_r3,acc,0.33416666666666667,0.013622434813136774,0
5
+ arc_challenge,acc,0.28071672354948807,0.013131238126975576,0
6
+ arc_challenge,acc_norm,0.3037542662116041,0.013438909184778766,0
7
+ arc_easy,acc,0.5993265993265994,0.010055304474255573,0
8
+ arc_easy,acc_norm,0.5694444444444444,0.010160345396860082,0
9
+ boolq,acc,0.5752293577981651,0.008645503833361106,1
10
+ cb,acc,0.42857142857142855,0.06672848092813058,1
11
+ cb,f1,0.26622479977906655,,1
12
+ copa,acc,0.81,0.039427724440366234,0
13
+ hellaswag,acc,0.4629555865365465,0.004976067726432562,0
14
+ hellaswag,acc_norm,0.609838677554272,0.004867893927258165,0
15
+ piqa,acc,0.7437431991294886,0.01018578783156506,0
16
+ piqa,acc_norm,0.7524483133841132,0.010069703966857116,0
17
+ rte,acc,0.5270758122743683,0.0300523034631437,0
18
+ sciq,acc,0.844,0.011480235006122363,0
19
+ sciq,acc_norm,0.794,0.012795613612786548,0
20
+ storycloze_2016,acc,0.7145911277391769,0.010443395884062115,0
21
+ winogrande,acc,0.5824782951854776,0.013859978264440246,0
4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_2_lm-eval_global_step80108_2023-02-25-09-56-03_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.326,
5
- "acc_stderr": 0.014830507204541037
6
- },
7
- "anli_r2": {
8
- "acc": 0.337,
9
- "acc_stderr": 0.014955087918653607
10
- },
11
- "anli_r3": {
12
- "acc": 0.33416666666666667,
13
- "acc_stderr": 0.013622434813136774
14
- },
15
- "cb": {
16
- "acc": 0.42857142857142855,
17
- "acc_stderr": 0.06672848092813058,
18
- "f1": 0.26622479977906655
19
- },
20
- "copa": {
21
- "acc": 0.81,
22
- "acc_stderr": 0.039427724440366234
23
- },
24
- "hellaswag": {
25
- "acc": 0.4629555865365465,
26
- "acc_stderr": 0.004976067726432562,
27
- "acc_norm": 0.609838677554272,
28
- "acc_norm_stderr": 0.004867893927258165
29
- },
30
- "rte": {
31
- "acc": 0.5270758122743683,
32
- "acc_stderr": 0.0300523034631437
33
- },
34
- "winogrande": {
35
- "acc": 0.5824782951854776,
36
- "acc_stderr": 0.013859978264440246
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7145911277391769,
40
- "acc_stderr": 0.010443395884062115
41
- },
42
- "boolq": {
43
- "acc": 0.5752293577981651,
44
- "acc_stderr": 0.008645503833361106
45
- },
46
- "arc_easy": {
47
- "acc": 0.5993265993265994,
48
- "acc_stderr": 0.010055304474255573,
49
- "acc_norm": 0.5694444444444444,
50
- "acc_norm_stderr": 0.010160345396860082
51
- },
52
- "arc_challenge": {
53
- "acc": 0.28071672354948807,
54
- "acc_stderr": 0.013131238126975576,
55
- "acc_norm": 0.3037542662116041,
56
- "acc_norm_stderr": 0.013438909184778766
57
- },
58
- "sciq": {
59
- "acc": 0.844,
60
- "acc_stderr": 0.011480235006122363,
61
- "acc_norm": 0.794,
62
- "acc_norm_stderr": 0.012795613612786548
63
- },
64
- "piqa": {
65
- "acc": 0.7437431991294886,
66
- "acc_stderr": 0.01018578783156506,
67
- "acc_norm": 0.7524483133841132,
68
- "acc_norm_stderr": 0.010069703966857116
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_3.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.316,0.014709193056057127,0
+ anli_r2,acc,0.337,0.014955087918653609,0
+ anli_r3,acc,0.355,0.0138192490040473,0
+ arc_challenge,acc,0.27559726962457337,0.013057169655761841,0
+ arc_challenge,acc_norm,0.30204778156996587,0.013417519144716413,0
+ arc_easy,acc,0.5896464646464646,0.010093531255765457,0
+ arc_easy,acc_norm,0.571969696969697,0.01015294331642626,0
+ boolq,acc,0.5831804281345566,0.008623192108843677,1
+ cb,acc,0.44642857142857145,0.06703189227942398,1
+ cb,f1,0.25805555555555554,,1
+ copa,acc,0.79,0.040936018074033256,0
+ hellaswag,acc,0.4627564230233021,0.004975919665116542,0
+ hellaswag,acc_norm,0.6117307309300936,0.004863603638367434,0
+ piqa,acc,0.7480957562568009,0.010128421335088683,0
+ piqa,acc_norm,0.7595212187159956,0.009971345364651066,0
+ rte,acc,0.5270758122743683,0.0300523034631437,0
+ sciq,acc,0.834,0.011772110370812184,0
+ sciq,acc_norm,0.793,0.012818553557843986,0
+ storycloze_2016,acc,0.711918760021379,0.010472537019822576,0
+ winogrande,acc,0.5824782951854776,0.013859978264440251,0
4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_3_lm-eval_global_step80108_2023-02-25-09-54-24_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.316,
5
- "acc_stderr": 0.014709193056057127
6
- },
7
- "anli_r2": {
8
- "acc": 0.337,
9
- "acc_stderr": 0.014955087918653609
10
- },
11
- "anli_r3": {
12
- "acc": 0.355,
13
- "acc_stderr": 0.0138192490040473
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942398,
18
- "f1": 0.25805555555555554
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.4627564230233021,
26
- "acc_stderr": 0.004975919665116542,
27
- "acc_norm": 0.6117307309300936,
28
- "acc_norm_stderr": 0.004863603638367434
29
- },
30
- "rte": {
31
- "acc": 0.5270758122743683,
32
- "acc_stderr": 0.0300523034631437
33
- },
34
- "winogrande": {
35
- "acc": 0.5824782951854776,
36
- "acc_stderr": 0.013859978264440251
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.711918760021379,
40
- "acc_stderr": 0.010472537019822576
41
- },
42
- "boolq": {
43
- "acc": 0.5831804281345566,
44
- "acc_stderr": 0.008623192108843677
45
- },
46
- "arc_easy": {
47
- "acc": 0.5896464646464646,
48
- "acc_stderr": 0.010093531255765457,
49
- "acc_norm": 0.571969696969697,
50
- "acc_norm_stderr": 0.01015294331642626
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27559726962457337,
54
- "acc_stderr": 0.013057169655761841,
55
- "acc_norm": 0.30204778156996587,
56
- "acc_norm_stderr": 0.013417519144716413
57
- },
58
- "sciq": {
59
- "acc": 0.834,
60
- "acc_stderr": 0.011772110370812184,
61
- "acc_norm": 0.793,
62
- "acc_norm_stderr": 0.012818553557843986
63
- },
64
- "piqa": {
65
- "acc": 0.7480957562568009,
66
- "acc_stderr": 0.010128421335088683,
67
- "acc_norm": 0.7595212187159956,
68
- "acc_norm_stderr": 0.009971345364651066
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_4.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.323,0.014794927843348633,0
+ anli_r2,acc,0.317,0.014721675438880236,0
+ anli_r3,acc,0.3625,0.013883037874225516,0
+ arc_challenge,acc,0.2790102389078498,0.013106784883601333,0
+ arc_challenge,acc_norm,0.30802047781569963,0.013491429517292038,0
+ arc_easy,acc,0.5942760942760943,0.010075755540128873,0
+ arc_easy,acc_norm,0.5757575757575758,0.010141333654958552,0
+ boolq,acc,0.5755351681957187,0.008644688121685498,1
+ cb,acc,0.35714285714285715,0.06460957383809221,1
+ cb,f1,0.19573820395738203,,1
+ copa,acc,0.79,0.040936018074033256,0
+ hellaswag,acc,0.4592710615415256,0.004973199296339971,0
+ hellaswag,acc_norm,0.6106353316072496,0.00486609688094144,0
+ piqa,acc,0.7540805223068553,0.010047331865625194,0
+ piqa,acc_norm,0.7589771490750816,0.009979042717267314,0
+ rte,acc,0.5126353790613718,0.030086851767188564,0
+ sciq,acc,0.835,0.01174363286691616,0
+ sciq,acc_norm,0.788,0.01293148186493805,0
+ storycloze_2016,acc,0.7194013896312133,0.01038980964728882,0
+ winogrande,acc,0.585635359116022,0.013844846232268565,0
4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_4_lm-eval_global_step80108_2023-02-25-09-56-03_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.323,
5
- "acc_stderr": 0.014794927843348633
6
- },
7
- "anli_r2": {
8
- "acc": 0.317,
9
- "acc_stderr": 0.014721675438880236
10
- },
11
- "anli_r3": {
12
- "acc": 0.3625,
13
- "acc_stderr": 0.013883037874225516
14
- },
15
- "cb": {
16
- "acc": 0.35714285714285715,
17
- "acc_stderr": 0.06460957383809221,
18
- "f1": 0.19573820395738203
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.4592710615415256,
26
- "acc_stderr": 0.004973199296339971,
27
- "acc_norm": 0.6106353316072496,
28
- "acc_norm_stderr": 0.00486609688094144
29
- },
30
- "rte": {
31
- "acc": 0.5126353790613718,
32
- "acc_stderr": 0.030086851767188564
33
- },
34
- "winogrande": {
35
- "acc": 0.585635359116022,
36
- "acc_stderr": 0.013844846232268565
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7194013896312133,
40
- "acc_stderr": 0.01038980964728882
41
- },
42
- "boolq": {
43
- "acc": 0.5755351681957187,
44
- "acc_stderr": 0.008644688121685498
45
- },
46
- "arc_easy": {
47
- "acc": 0.5942760942760943,
48
- "acc_stderr": 0.010075755540128873,
49
- "acc_norm": 0.5757575757575758,
50
- "acc_norm_stderr": 0.010141333654958552
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2790102389078498,
54
- "acc_stderr": 0.013106784883601333,
55
- "acc_norm": 0.30802047781569963,
56
- "acc_norm_stderr": 0.013491429517292038
57
- },
58
- "sciq": {
59
- "acc": 0.835,
60
- "acc_stderr": 0.01174363286691616,
61
- "acc_norm": 0.788,
62
- "acc_norm_stderr": 0.01293148186493805
63
- },
64
- "piqa": {
65
- "acc": 0.7540805223068553,
66
- "acc_stderr": 0.010047331865625194,
67
- "acc_norm": 0.7589771490750816,
68
- "acc_norm_stderr": 0.009979042717267314
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_5.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.324,0.014806864733738857,0
+ anli_r2,acc,0.338,0.014965960710224498,0
+ anli_r3,acc,0.3525,0.013797164918918362,0
+ arc_challenge,acc,0.2841296928327645,0.013179442447653887,0
+ arc_challenge,acc_norm,0.3037542662116041,0.01343890918477876,0
+ arc_easy,acc,0.6043771043771043,0.010033741393430983,0
+ arc_easy,acc_norm,0.5749158249158249,0.010143966195717845,0
+ boolq,acc,0.5730886850152905,0.008651119069643816,1
+ cb,acc,0.42857142857142855,0.06672848092813057,1
+ cb,f1,0.25882352941176473,,1
+ copa,acc,0.81,0.03942772444036623,0
+ hellaswag,acc,0.45907189802828124,0.004973036453863711,0
+ hellaswag,acc_norm,0.6099382593108943,0.004867670042866713,0
+ piqa,acc,0.7480957562568009,0.010128421335088683,0
+ piqa,acc_norm,0.7573449401523396,0.01000200256970869,0
+ rte,acc,0.5234657039711191,0.030063300411902652,0
+ sciq,acc,0.836,0.011715000693181331,0
+ sciq,acc_norm,0.791,0.012864077288499337,0
+ storycloze_2016,acc,0.7151256012827365,0.010437513986611718,0
+ winogrande,acc,0.5777426992896606,0.013881582030658552,0
4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_5_lm-eval_global_step80108_2023-02-25-09-56-03_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.324,
5
- "acc_stderr": 0.014806864733738857
6
- },
7
- "anli_r2": {
8
- "acc": 0.338,
9
- "acc_stderr": 0.014965960710224498
10
- },
11
- "anli_r3": {
12
- "acc": 0.3525,
13
- "acc_stderr": 0.013797164918918362
14
- },
15
- "cb": {
16
- "acc": 0.42857142857142855,
17
- "acc_stderr": 0.06672848092813057,
18
- "f1": 0.25882352941176473
19
- },
20
- "copa": {
21
- "acc": 0.81,
22
- "acc_stderr": 0.03942772444036623
23
- },
24
- "hellaswag": {
25
- "acc": 0.45907189802828124,
26
- "acc_stderr": 0.004973036453863711,
27
- "acc_norm": 0.6099382593108943,
28
- "acc_norm_stderr": 0.004867670042866713
29
- },
30
- "rte": {
31
- "acc": 0.5234657039711191,
32
- "acc_stderr": 0.030063300411902652
33
- },
34
- "winogrande": {
35
- "acc": 0.5777426992896606,
36
- "acc_stderr": 0.013881582030658552
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7151256012827365,
40
- "acc_stderr": 0.010437513986611718
41
- },
42
- "boolq": {
43
- "acc": 0.5730886850152905,
44
- "acc_stderr": 0.008651119069643816
45
- },
46
- "arc_easy": {
47
- "acc": 0.6043771043771043,
48
- "acc_stderr": 0.010033741393430983,
49
- "acc_norm": 0.5749158249158249,
50
- "acc_norm_stderr": 0.010143966195717845
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2841296928327645,
54
- "acc_stderr": 0.013179442447653887,
55
- "acc_norm": 0.3037542662116041,
56
- "acc_norm_stderr": 0.01343890918477876
57
- },
58
- "sciq": {
59
- "acc": 0.836,
60
- "acc_stderr": 0.011715000693181331,
61
- "acc_norm": 0.791,
62
- "acc_norm_stderr": 0.012864077288499337
63
- },
64
- "piqa": {
65
- "acc": 0.7480957562568009,
66
- "acc_stderr": 0.010128421335088683,
67
- "acc_norm": 0.7573449401523396,
68
- "acc_norm_stderr": 0.01000200256970869
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5083902070671548, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03989468844910082}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07528694149270759, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001485415514756619}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3780318171066232, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005223104661769293}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11728631797246779, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019247654134608228}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03516210005523054, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009572816724584412}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.18705373069728767, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0037081482160342696}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05468830386197015, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012066786386101694}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07002377028639689, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013498282472237842}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3530593907516912, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00490724418597446}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10921255839870517, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017509846379559321}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07051076285503056, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014029195285359902}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3526831574759125, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004764676209885756}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10965304015302584, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00179658805334725}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
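Each agg.*.json generation file added in this commit keeps a flat "results" list of per-metric records (a value plus a matching *_stderr field, together with the task, prompt and dataset identifiers) and a "config" block recording the lm-eval run settings (model_args, num_fewshot, batch_size, limit, seed). A small illustrative reader, assuming only the keys visible in these files:

import json

def summarize_agg(path: str):
    # Returns (task, prompt, metric, value, stderr) tuples and the shot count of the run.
    with open(path) as f:
        agg = json.load(f)
    rows = []
    for rec in agg["results"]:
        for key, value in rec.items():
            if isinstance(value, float) and not key.endswith("_stderr"):
                rows.append((rec["task_name"], rec["prompt_name"], key,
                             value, rec.get(f"{key}_stderr")))
    return rows, agg["config"]["num_fewshot"]

# rows, shots = summarize_agg(
#     "4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.json")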
4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.525451157972169, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.020807029076595943}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07578151398935791, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014658497562735056}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3776088572433392, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005222009047076881}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11815166842945922, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019539933108443297}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.035216953195913794, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008936559179157622}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.18552571500246623, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003651941896665819}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05505652232011781, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012284406115594482}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07009792254358821, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013082058038069526}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.35054744472531996, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004791505363230279}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1094938648119312, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017598889432284988}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07078544810917187, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013654984516325997}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.35187230618740717, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004734030010907423}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11026920423657147, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018110860096435387}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.0073229441233485515, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007507591641053894}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.01188187354308511, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001193460962416848}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.007782363958910315, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007494140926600601}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0014793812141488027, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000229663938081861}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.002825244488953548, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005238543235168785}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0016405694963126324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00023525203173163016}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.005727289612377416, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005861714348780361}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.009446302855839223, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009620615699727216}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.006018804038099202, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005623113681535245}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.00679434254221087, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006951873055254002}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.011060401872827133, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011163157558010933}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.007193842982164766, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006896239402426108}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 5.090669570430738e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 8.489969077479997e-07}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 10.255556843183474, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14306801493502502}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.40133894612714033, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022786254106147374}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4353400028716086, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025934401772713216}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.40223710706751803, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019113695305159313}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.17167615815004073, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017373085671162421}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.18892368452661648, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002036271603133117}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.17238412451550736, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001647066730421544}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2895776384246929, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018377899797485326}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.31561872620267456, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002186944465790045}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2905126850575018, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016066395148831222}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.33661571162841014, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021532181771208945}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.36620748100065265, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002517396529311699}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3376761630795252, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001906552910299397}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 10.453538794646896, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1018770051872085}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.40269265483237937, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022414598319616666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4321840404996293, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002501669112723814}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.40172960710680156, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018233890583034939}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.17489910198304812, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001749578738044224}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.18884294973750174, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001942308180411026}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.17413364387783356, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001607923940126345}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.29510908654175555, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00188243407356544}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.31785854835047744, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021450790852441397}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.29458902141806165, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016001785328170475}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.3399297127876474, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002185077308803786}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.364926897184195, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00242487417897827}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3390409817775856, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018684469365198332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.024667231581309436, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016446798164940699}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.04578732040023698, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027949727542311478}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.029257582846789993, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017353864694049216}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0025778362583607642, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005426786000843293}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.005056626447538328, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008909529686166358}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0029439626544144923, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00046267375464988764}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.021633378868917415, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014766881089830067}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.039954198597200195, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002427151773773013}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.025397669122061574, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001477824062133286}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.021302386042875585, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014719776290891184}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.039097440525313706, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023851152450321484}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.024937374055980524, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014662690411890744}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.27307893333920474, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07257669784005619}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0016784285108223424, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00046933173483779724}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.001546401819187842, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00043408961008404196}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0015539306240835566, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00042422089610158747}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 5.04489960649783e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 5.044899606497852e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 5.360205831903945e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 5.360205831903959e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 5.19777535214928e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 5.1977753521493134e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0015267311823769563, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00042072964022598367}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.001446281044627963, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000414342893101565}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0014344496420688818, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0003937675400802755}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0015235781201228954, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0004126564052488673}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0014595891418657935, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00041568179279564054}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.001442691159744362, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0003927017017637451}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 8.502508359632142e-38, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.6224422585741e-33}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
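The agg.*.json files store one metric per entry in `results`, each paired with a bootstrap standard error (`bootstrap_iters` is only 10 here, so those errors are coarse). A minimal sketch pulling a single metric out of the 4-shot XSum file a few sections above, assuming it has been downloaded locally under its repository path:

```python
import json

path = "4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.json"
with open(path) as f:
    agg = json.load(f)

# Each entry in "results" holds a single metric plus its *_stderr companion.
entry = next(e for e in agg["results"] if "rouge2_fmeasure" in e)
value, stderr = entry["rouge2_fmeasure"], entry["rouge2_fmeasure_stderr"]
# Rough 95% interval under a normal approximation of the bootstrap estimate.
print(f"rouge2_fmeasure: {value:.4f} +/- {1.96 * stderr:.4f}")
```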
4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:523a10be06a013f2bb05811547226fd98dc369df3847b27724be8a1aafea3c94
+ size 7826518
4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:24c9ffafef83f6d710e4c86cb60ee2c9777154d34e747dfdd445961ea9eb1316
+ size 8717380
4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ac756f88a66239f6b8606c2b7f66959a7836f8ea9af0ec79223498c6533292d
+ size 34799887
4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e7fe94a928fa0803373afccab617f52f5aec27b430fa0af243ba7682a6989937
+ size 8394602
4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b722acf4e2608d29748cb8ee7920902003c939595743a4cf0cdf83c583f45b3b
+ size 9478188
4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f09666f303c2b90cbeeba1c009c32e5ca8e2e465948115d1c01bd446de824cc7
+ size 11672650
4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ac678d77590701961941c02a30d1b52c4bb0eb404159f52e5caf1640711028a1
+ size 13897546
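The examples.*.jsonl entries above are Git LFS pointer files (version, oid, size), not the generations themselves; the actual per-example records have to be pulled through LFS before they can be read. A minimal sketch, assuming a local clone of this repository with git-lfs installed (the record schema is not shown in this diff, so the code only counts lines):

```python
# Fetch the real payload first, e.g. from inside the clone:
#   git lfs pull --include="4b284b12bc4seed2/evaluation/generation/examples.*"
import json

path = "4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.jsonl"
with open(path) as f:
    records = [json.loads(line) for line in f if line.strip()]
print(f"{len(records)} generation records")
```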
4b284b12bc4seed2/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
+ dataset,fewshots,prompt,metric,value
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0043911029729981465
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0043911029729981465
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.14634556896551848
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.14634556896551848
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.16691425263662168
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.16691425263662168
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.17476964694401387
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.17476964694401387
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.17238412451550736
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.17238412451550736
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.17413364387783356
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.17413364387783356
+ e2e_nlg_cleaned,5,average,multiple,0.1398230566520822
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.01153407231838246
+ gem_xsum,0,median,rouge2_fmeasure,0.01153407231838246
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.01870363630762986
+ gem_xsum,1,median,rouge2_fmeasure,0.01870363630762986
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.02149971321984745
+ gem_xsum,2,median,rouge2_fmeasure,0.02149971321984745
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.012510492483954895
+ gem_xsum,3,median,rouge2_fmeasure,0.012510492483954895
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.0029439626544144923
+ gem_xsum,4,median,rouge2_fmeasure,0.0029439626544144923
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,5.19777535214928e-05
+ gem_xsum,5,median,rouge2_fmeasure,5.19777535214928e-05
+ gem_xsum,5,average,multiple,0.01120730912295844
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05341982709541381
+ web_nlg_en,0,median,rouge2_fmeasure,0.05341982709541381
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05569920972530597
+ web_nlg_en,1,median,rouge2_fmeasure,0.05569920972530597
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.055829123679104704
+ web_nlg_en,2,median,rouge2_fmeasure,0.055829123679104704
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.0555802940665526
+ web_nlg_en,3,median,rouge2_fmeasure,0.0555802940665526
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05468830386197015
+ web_nlg_en,4,median,rouge2_fmeasure,0.05468830386197015
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05505652232011781
+ web_nlg_en,5,median,rouge2_fmeasure,0.05505652232011781
+ web_nlg_en,5,average,multiple,0.05504554679141084
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.005442298869452033
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.005442298869452033
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.033304794776064954
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.033304794776064954
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.03566526748783912
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.03566526748783912
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.02943630075177706
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.02943630075177706
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.009975083716813172
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.009975083716813172
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0016405694963126324
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0016405694963126324
+ wiki_lingua_en,5,average,multiple,0.019244052516376495
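merged.csv keeps one rouge2_fmeasure row per prompt and per-fewshot median for each dataset, plus a final `average` row per dataset (the mean of the 0-5 shot medians). A minimal sketch with pandas (assumed installed) that pivots the medians into a dataset-by-fewshots table:

```python
import pandas as pd

# Minimal sketch, assuming merged.csv has been fetched locally.
df = pd.read_csv("4b284b12bc4seed2/evaluation/generation/merged.csv")

# Keep the per-fewshot median rows and pivot to dataset x fewshots.
medians = df[df["prompt"] == "median"]
table = medians.pivot(index="dataset", columns="fewshots", values="value")
print(table.round(4))
```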
4b284b12bc4seed2/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3666975928567752, "bleu_stderr": 0.033942619502247987, "rouge1_fmeasure": 0.11410488927905772, "rouge1_fmeasure_stderr": 0.0021273756435243453, "rouge1_precision": 0.07654378919459422, "rouge1_precision_stderr": 0.0018618261262039134, "rouge1_recall": 0.32609614121370334, "rouge1_recall_stderr": 0.004974989840599945, "rouge2_fmeasure": 0.05341982709541381, "rouge2_fmeasure_stderr": 0.001325737944102095, "rouge2_precision": 0.03603776896987243, "rouge2_precision_stderr": 0.0012493286067619042, "rouge2_recall": 0.15877363093279048, "rouge2_recall_stderr": 0.003408901924533892, "rougeL_fmeasure": 0.10840968841172977, "rougeL_fmeasure_stderr": 0.0019678561218047716, "rougeL_precision": 0.07257143485800877, "rougeL_precision_stderr": 0.0017468394574395602, "rougeL_recall": 0.31289502631185434, "rougeL_recall_stderr": 0.004801473267924515, "rougeLsum_fmeasure": 0.10730895702341088, "rougeLsum_fmeasure_stderr": 0.0019852721665848463, "rougeLsum_precision": 0.0721278567630955, "rougeLsum_precision_stderr": 0.0017765340801950376, "rougeLsum_recall": 0.30700384670165076, "rougeLsum_recall_stderr": 0.004639025717519335}}, "1": {"PALM_prompt": {"bleu": 0.4204000069724605, "bleu_stderr": 0.03454709980239004, "rouge1_fmeasure": 0.11863230545480506, "rouge1_fmeasure_stderr": 0.0020998490379809595, "rouge1_precision": 0.07729381671566446, "rouge1_precision_stderr": 0.0016249952401225709, "rouge1_recall": 0.3463514185710177, "rouge1_recall_stderr": 0.004906324748729579, "rouge2_fmeasure": 0.05569920972530597, "rouge2_fmeasure_stderr": 0.0013333357482266375, "rouge2_precision": 0.036344067746574706, "rouge2_precision_stderr": 0.0010138056526646537, "rouge2_recall": 0.1684509670618413, "rouge2_recall_stderr": 0.0034374849683514908, "rougeL_fmeasure": 0.11211899409488457, "rougeL_fmeasure_stderr": 0.0019248317217534817, "rougeL_precision": 0.0727861600331886, "rougeL_precision_stderr": 0.0014689871591747046, "rougeL_recall": 0.32987484171743015, "rougeL_recall_stderr": 0.004706952933270804, "rougeLsum_fmeasure": 0.11160505052176106, "rougeLsum_fmeasure_stderr": 0.001958165298495008, "rougeLsum_precision": 0.07272118809874989, "rougeLsum_precision_stderr": 0.0015190345258311126, "rougeLsum_recall": 0.3259034212737053, "rougeLsum_recall_stderr": 0.004542325342065352}}, "2": {"PALM_prompt": {"bleu": 0.4748201943710615, "bleu_stderr": 0.030699841465897198, "rouge1_fmeasure": 0.11834242699459707, "rouge1_fmeasure_stderr": 0.0020730822161238323, "rouge1_precision": 0.07667243246356968, "rouge1_precision_stderr": 0.0015739220370015814, "rouge1_recall": 0.35665862094926554, "rouge1_recall_stderr": 0.005095225993954892, "rouge2_fmeasure": 0.055829123679104704, "rouge2_fmeasure_stderr": 0.0013065760171567398, "rouge2_precision": 0.03605855238775504, "rouge2_precision_stderr": 0.0009560579302419175, "rouge2_recall": 0.17612198539506513, "rouge2_recall_stderr": 0.0035618179908472056, "rougeL_fmeasure": 0.11117048364501488, "rougeL_fmeasure_stderr": 0.0018719625581382908, "rougeL_precision": 0.07180378962637364, "rougeL_precision_stderr": 0.0014070124296651453, "rougeL_recall": 0.33774827297948934, "rougeL_recall_stderr": 0.004811147115289192, "rougeLsum_fmeasure": 0.11132148782522473, "rougeLsum_fmeasure_stderr": 0.001919293265591665, "rougeLsum_precision": 0.07214285401823774, "rougeLsum_precision_stderr": 0.0014626628042382018, "rougeLsum_recall": 0.33633909533117384, "rougeLsum_recall_stderr": 0.00472655160720387}}, "3": {"PALM_prompt": {"bleu": 0.5549733758882326, 
"bleu_stderr": 0.03334480985874615, "rouge1_fmeasure": 0.11965471898211823, "rouge1_fmeasure_stderr": 0.0020402171706270338, "rouge1_precision": 0.07698392131996552, "rouge1_precision_stderr": 0.0015413715512529594, "rouge1_recall": 0.37726780498158974, "rouge1_recall_stderr": 0.005202842583236199, "rouge2_fmeasure": 0.0555802940665526, "rouge2_fmeasure_stderr": 0.0012958506193786643, "rouge2_precision": 0.0357119459728351, "rouge2_precision_stderr": 0.0009447013213754063, "rouge2_recall": 0.18257224954432466, "rouge2_recall_stderr": 0.0036301676144376914, "rougeL_fmeasure": 0.11086203282772523, "rougeL_fmeasure_stderr": 0.0018308098587233419, "rougeL_precision": 0.07108997931335904, "rougeL_precision_stderr": 0.0013561691764805398, "rougeL_recall": 0.3515365565823446, "rougeL_recall_stderr": 0.004828539628946869, "rougeLsum_fmeasure": 0.11168460054725654, "rougeLsum_fmeasure_stderr": 0.0018725810295441223, "rougeLsum_precision": 0.07183721682893512, "rougeLsum_precision_stderr": 0.0014096707081297398, "rougeLsum_recall": 0.3524043706111252, "rougeLsum_recall_stderr": 0.004734089066449836}}, "4": {"PALM_prompt": {"bleu": 0.5083902070671548, "bleu_stderr": 0.03989468844910082, "rouge1_fmeasure": 0.11728631797246779, "rouge1_fmeasure_stderr": 0.0019247654134608228, "rouge1_precision": 0.07528694149270759, "rouge1_precision_stderr": 0.001485415514756619, "rouge1_recall": 0.3780318171066232, "rouge1_recall_stderr": 0.005223104661769293, "rouge2_fmeasure": 0.05468830386197015, "rouge2_fmeasure_stderr": 0.0012066786386101694, "rouge2_precision": 0.03516210005523054, "rouge2_precision_stderr": 0.0009572816724584412, "rouge2_recall": 0.18705373069728767, "rouge2_recall_stderr": 0.0037081482160342696, "rougeL_fmeasure": 0.10921255839870517, "rougeL_fmeasure_stderr": 0.0017509846379559321, "rougeL_precision": 0.07002377028639689, "rougeL_precision_stderr": 0.0013498282472237842, "rougeL_recall": 0.3530593907516912, "rougeL_recall_stderr": 0.00490724418597446, "rougeLsum_fmeasure": 0.10965304015302584, "rougeLsum_fmeasure_stderr": 0.00179658805334725, "rougeLsum_precision": 0.07051076285503056, "rougeLsum_precision_stderr": 0.0014029195285359902, "rougeLsum_recall": 0.3526831574759125, "rougeLsum_recall_stderr": 0.004764676209885756}}, "5": {"PALM_prompt": {"bleu": 0.525451157972169, "bleu_stderr": 0.020807029076595943, "rouge1_fmeasure": 0.11815166842945922, "rouge1_fmeasure_stderr": 0.0019539933108443297, "rouge1_precision": 0.07578151398935791, "rouge1_precision_stderr": 0.0014658497562735056, "rouge1_recall": 0.3776088572433392, "rouge1_recall_stderr": 0.005222009047076881, "rouge2_fmeasure": 0.05505652232011781, "rouge2_fmeasure_stderr": 0.0012284406115594482, "rouge2_precision": 0.035216953195913794, "rouge2_precision_stderr": 0.0008936559179157622, "rouge2_recall": 0.18552571500246623, "rouge2_recall_stderr": 0.003651941896665819, "rougeL_fmeasure": 0.1094938648119312, "rougeL_fmeasure_stderr": 0.0017598889432284988, "rougeL_precision": 0.07009792254358821, "rougeL_precision_stderr": 0.0013082058038069526, "rougeL_recall": 0.35054744472531996, "rougeL_recall_stderr": 0.004791505363230279, "rougeLsum_fmeasure": 0.11026920423657147, "rougeLsum_fmeasure_stderr": 0.0018110860096435387, "rougeLsum_precision": 0.07078544810917187, "rougeLsum_precision_stderr": 0.0013654984516325997, "rougeLsum_recall": 0.35187230618740717, "rougeLsum_recall_stderr": 0.004734030010907423}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 0.24392437655372481, "bleu_stderr": 0.023857847767670762, "rouge1_fmeasure": 
0.09276892306618244, "rouge1_fmeasure_stderr": 0.0011960344018118775, "rouge1_precision": 0.08041542381459962, "rouge1_precision_stderr": 0.0012034347430153701, "rouge1_recall": 0.12977261642720425, "rouge1_recall_stderr": 0.0016061415032672362, "rouge2_fmeasure": 0.005442298869452033, "rouge2_fmeasure_stderr": 0.00031336134550929984, "rouge2_precision": 0.00482101020032747, "rouge2_precision_stderr": 0.000288358623019921, "rouge2_recall": 0.007392323231333513, "rouge2_recall_stderr": 0.0004384293210501124, "rougeL_fmeasure": 0.08081272257942029, "rougeL_fmeasure_stderr": 0.000958641259754323, "rougeL_precision": 0.06948188762406071, "rougeL_precision_stderr": 0.0009562354726599046, "rougeL_recall": 0.1148205615117217, "rougeL_recall_stderr": 0.0013763320189433634, "rougeLsum_fmeasure": 0.08675704545208646, "rougeLsum_fmeasure_stderr": 0.0011000237175663734, "rougeLsum_precision": 0.07505126481568339, "rougeLsum_precision_stderr": 0.0011046552942777078, "rougeLsum_recall": 0.12191849988137807, "rougeLsum_recall_stderr": 0.001502332271175678}}, "1": {"tldr_en": {"bleu": 1.6125876717654175, "bleu_stderr": 0.030611128690029174, "rouge1_fmeasure": 0.17098154370812704, "rouge1_fmeasure_stderr": 0.0019238619176074028, "rouge1_precision": 0.1455337083731935, "rouge1_precision_stderr": 0.0019032086865170478, "rouge1_recall": 0.25012143517213287, "rouge1_recall_stderr": 0.0028277377897345103, "rouge2_fmeasure": 0.033304794776064954, "rouge2_fmeasure_stderr": 0.0008869000313608735, "rouge2_precision": 0.028195165306937067, "rouge2_precision_stderr": 0.0007889557545714422, "rouge2_recall": 0.0503519559667884, "rouge2_recall_stderr": 0.001442734800038838, "rougeL_fmeasure": 0.12805479000603123, "rougeL_fmeasure_stderr": 0.0012729011455461764, "rougeL_precision": 0.10772839012506621, "rougeL_precision_stderr": 0.0012393912404238278, "rougeL_recall": 0.19185007869835716, "rougeL_recall_stderr": 0.002113757458604234, "rougeLsum_fmeasure": 0.15864492369929892, "rougeLsum_fmeasure_stderr": 0.001762860943938729, "rougeLsum_precision": 0.13484649948542737, "rougeLsum_precision_stderr": 0.001744825336362372, "rougeLsum_recall": 0.2327450602622433, "rougeLsum_recall_stderr": 0.002613147650213692}}, "2": {"tldr_en": {"bleu": 1.7772115660325392, "bleu_stderr": 0.047936589027589356, "rouge1_fmeasure": 0.17893721887231873, "rouge1_fmeasure_stderr": 0.0019115838216723414, "rouge1_precision": 0.1521730564438585, "rouge1_precision_stderr": 0.0019244965733381224, "rouge1_recall": 0.2619409970589743, "rouge1_recall_stderr": 0.002792920076539665, "rouge2_fmeasure": 0.03566526748783912, "rouge2_fmeasure_stderr": 0.0008976747682681302, "rouge2_precision": 0.030121700253615085, "rouge2_precision_stderr": 0.0007987546017186038, "rouge2_recall": 0.05440936550544913, "rouge2_recall_stderr": 0.0015261127047882838, "rougeL_fmeasure": 0.1348199640996336, "rougeL_fmeasure_stderr": 0.0012709671031115265, "rougeL_precision": 0.11333400415154432, "rougeL_precision_stderr": 0.001252662085867768, "rougeL_recall": 0.20213029522545098, "rougeL_recall_stderr": 0.0021327420832715108, "rougeLsum_fmeasure": 0.16603969831441404, "rougeLsum_fmeasure_stderr": 0.0017701912652569377, "rougeLsum_precision": 0.14105987920215718, "rougeLsum_precision_stderr": 0.0017803037903608345, "rougeLsum_recall": 0.24387535828634377, "rougeLsum_recall_stderr": 0.002626333657910018}}, "3": {"tldr_en": {"bleu": 1.7384602326131295, "bleu_stderr": 0.07786433868102548, "rouge1_fmeasure": 0.1475192981997226, "rouge1_fmeasure_stderr": 0.0020874234612903364, 
"rouge1_precision": 0.13124657860364822, "rouge1_precision_stderr": 0.0022184691237030767, "rouge1_recall": 0.21580668554586308, "rouge1_recall_stderr": 0.0031217848837206867, "rouge2_fmeasure": 0.02943630075177706, "rouge2_fmeasure_stderr": 0.0008619263938236904, "rouge2_precision": 0.02556631610400526, "rouge2_precision_stderr": 0.0008289864287315672, "rouge2_recall": 0.04489589142005177, "rouge2_recall_stderr": 0.0014219423335759458, "rougeL_fmeasure": 0.11170409200150849, "rougeL_fmeasure_stderr": 0.001457057228328002, "rougeL_precision": 0.09922448432903473, "rougeL_precision_stderr": 0.0016571195136742573, "rougeL_recall": 0.1671818366386249, "rougeL_recall_stderr": 0.002414244291432632, "rougeLsum_fmeasure": 0.1370205638904663, "rougeLsum_fmeasure_stderr": 0.0019368277108348473, "rougeLsum_precision": 0.12208780038033144, "rougeLsum_precision_stderr": 0.002084036711937265, "rougeLsum_recall": 0.20069686438127185, "rougeLsum_recall_stderr": 0.0029103319320837972}}, "4": {"tldr_en": {"bleu": 0.3960094829116993, "bleu_stderr": 0.03202328755305624, "rouge1_fmeasure": 0.04982811807836869, "rouge1_fmeasure_stderr": 0.0017359012937995398, "rouge1_precision": 0.04553666129964189, "rouge1_precision_stderr": 0.0017147390368758152, "rouge1_recall": 0.07503959025491676, "rouge1_recall_stderr": 0.0026592166476963725, "rouge2_fmeasure": 0.009975083716813172, "rouge2_fmeasure_stderr": 0.0005509539367896302, "rouge2_precision": 0.008610828899341378, "rouge2_precision_stderr": 0.0005000696489982937, "rouge2_recall": 0.01599372920183621, "rouge2_recall_stderr": 0.0009747815347100425, "rougeL_fmeasure": 0.0384196111991066, "rougeL_fmeasure_stderr": 0.0012953226612167558, "rougeL_precision": 0.03503994473270826, "rougeL_precision_stderr": 0.0012923406095347262, "rougeL_recall": 0.059122890511189616, "rougeL_recall_stderr": 0.0020945165554899304, "rougeLsum_fmeasure": 0.04642174084177454, "rougeLsum_fmeasure_stderr": 0.0016156979564290537, "rougeLsum_precision": 0.04241127409619883, "rougeLsum_precision_stderr": 0.0015945350214724905, "rougeLsum_recall": 0.07008648862134945, "rougeLsum_recall_stderr": 0.002485730743546559}}, "5": {"tldr_en": {"bleu": 5.090669570430738e-07, "bleu_stderr": 8.489969077479997e-07, "rouge1_fmeasure": 0.007782363958910315, "rouge1_fmeasure_stderr": 0.0007494140926600601, "rouge1_precision": 0.0073229441233485515, "rouge1_precision_stderr": 0.0007507591641053894, "rouge1_recall": 0.01188187354308511, "rouge1_recall_stderr": 0.001193460962416848, "rouge2_fmeasure": 0.0016405694963126324, "rouge2_fmeasure_stderr": 0.00023525203173163016, "rouge2_precision": 0.0014793812141488027, "rouge2_precision_stderr": 0.000229663938081861, "rouge2_recall": 0.002825244488953548, "rouge2_recall_stderr": 0.0005238543235168785, "rougeL_fmeasure": 0.006018804038099202, "rougeL_fmeasure_stderr": 0.0005623113681535245, "rougeL_precision": 0.005727289612377416, "rougeL_precision_stderr": 0.0005861714348780361, "rougeL_recall": 0.009446302855839223, "rougeL_recall_stderr": 0.0009620615699727216, "rougeLsum_fmeasure": 0.007193842982164766, "rougeLsum_fmeasure_stderr": 0.0006896239402426108, "rougeLsum_precision": 0.00679434254221087, "rougeLsum_precision_stderr": 0.0006951873055254002, "rougeLsum_recall": 0.011060401872827133, "rougeLsum_recall_stderr": 0.0011163157558010933}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.04307253383096468, "bleu_stderr": 0.00615780467434044, "rouge1_fmeasure": 0.046890391429100335, "rouge1_fmeasure_stderr": 0.0007011195091317708, 
"rouge1_precision": 0.05053375977322034, "rouge1_precision_stderr": 0.0010158112551469237, "rouge1_recall": 0.0652936855655388, "rouge1_recall_stderr": 0.0011167830442822143, "rouge2_fmeasure": 0.0043911029729981465, "rouge2_fmeasure_stderr": 0.00020467393315053172, "rouge2_precision": 0.0035689796753305652, "rouge2_precision_stderr": 0.00021117638207178657, "rouge2_recall": 0.00680143761288298, "rouge2_recall_stderr": 0.0003224069999120447, "rougeL_fmeasure": 0.04657371060940762, "rougeL_fmeasure_stderr": 0.0006904790861998436, "rougeL_precision": 0.05006831349983385, "rougeL_precision_stderr": 0.0009936655556977518, "rougeL_recall": 0.06495084101211217, "rougeL_recall_stderr": 0.0011078411551186476, "rougeLsum_fmeasure": 0.0414614170947359, "rougeLsum_fmeasure_stderr": 0.0006139268554968763, "rougeLsum_precision": 0.046457395636209424, "rougeLsum_precision_stderr": 0.0010026919226726794, "rougeLsum_recall": 0.056636945108678484, "rougeLsum_recall_stderr": 0.0009405550431936621}}, "1": {"generate_text_restaurant": {"bleu": 7.5581936543754376, "bleu_stderr": 0.11126839308169957, "rouge1_fmeasure": 0.3720334093393895, "rouge1_fmeasure_stderr": 0.0021887575462331737, "rouge1_precision": 0.36468914807777286, "rouge1_precision_stderr": 0.0026191701242675273, "rouge1_recall": 0.42454236506789683, "rouge1_recall_stderr": 0.002854215718266759, "rouge2_fmeasure": 0.14634556896551848, "rouge2_fmeasure_stderr": 0.001551058335752227, "rouge2_precision": 0.14368390527185781, "rouge2_precision_stderr": 0.0017338258439567463, "rouge2_recall": 0.16851243140438502, "rouge2_recall_stderr": 0.001912073180376547, "rougeL_fmeasure": 0.2545609931556668, "rougeL_fmeasure_stderr": 0.0016225757478674334, "rougeL_precision": 0.2501530047146125, "rougeL_precision_stderr": 0.0019969846559516353, "rougeL_recall": 0.29284296638863844, "rougeL_recall_stderr": 0.0022356510211168855, "rougeLsum_fmeasure": 0.30559945190747523, "rougeLsum_fmeasure_stderr": 0.002005656004936061, "rougeLsum_precision": 0.30072566914524307, "rougeLsum_precision_stderr": 0.0023876131484858974, "rougeLsum_recall": 0.34812960779874275, "rougeLsum_recall_stderr": 0.0025571739561978655}}, "2": {"generate_text_restaurant": {"bleu": 9.401928562229555, "bleu_stderr": 0.1567763787830054, "rouge1_fmeasure": 0.40026173535365556, "rouge1_fmeasure_stderr": 0.0019715999520502525, "rouge1_precision": 0.40080790096672786, "rouge1_precision_stderr": 0.0023431638323800513, "rouge1_recall": 0.43692976078064044, "rouge1_recall_stderr": 0.0027384926504820336, "rouge2_fmeasure": 0.16691425263662168, "rouge2_fmeasure_stderr": 0.001584309925310028, "rouge2_precision": 0.16690793931954076, "rouge2_precision_stderr": 0.0017426038897506064, "rouge2_recall": 0.1845979813386405, "rouge2_recall_stderr": 0.0019887719670502926, "rougeL_fmeasure": 0.2778592068167227, "rougeL_fmeasure_stderr": 0.0016171214159128506, "rougeL_precision": 0.2785202312030322, "rougeL_precision_stderr": 0.0019011043706789524, "rougeL_recall": 0.3046480183356087, "rougeL_recall_stderr": 0.0022397319337808465, "rougeLsum_fmeasure": 0.3300400794380935, "rougeLsum_fmeasure_stderr": 0.0019018703890833699, "rougeLsum_precision": 0.3309782590686788, "rougeLsum_precision_stderr": 0.002210876758312972, "rougeLsum_recall": 0.3603290477325708, "rougeLsum_recall_stderr": 0.0025365179929270143}}, "3": {"generate_text_restaurant": {"bleu": 10.213394889083318, "bleu_stderr": 0.13997266948295084, "rouge1_fmeasure": 0.40620019916876426, "rouge1_fmeasure_stderr": 0.001889361609626469, "rouge1_precision": 
0.40417575321020377, "rouge1_precision_stderr": 0.00227014310665305, "rouge1_recall": 0.44263977445184194, "rouge1_recall_stderr": 0.002603197211967317, "rouge2_fmeasure": 0.17476964694401387, "rouge2_fmeasure_stderr": 0.0016365140191570804, "rouge2_precision": 0.17336100314085984, "rouge2_precision_stderr": 0.001719523491610732, "rouge2_recall": 0.1925950069188434, "rouge2_recall_stderr": 0.002008348010949252, "rougeL_fmeasure": 0.2890898692590789, "rougeL_fmeasure_stderr": 0.0016161064052956943, "rougeL_precision": 0.2876013029701323, "rougeL_precision_stderr": 0.0018718292268598646, "rougeL_recall": 0.31615779730252647, "rougeL_recall_stderr": 0.0021802769594592, "rougeLsum_fmeasure": 0.3406161582563531, "rougeLsum_fmeasure_stderr": 0.001893065321419412, "rougeLsum_precision": 0.3390891522957183, "rougeLsum_precision_stderr": 0.0021744867159723394, "rougeLsum_recall": 0.37119788702454404, "rougeLsum_recall_stderr": 0.0024763950418716834}}, "4": {"generate_text_restaurant": {"bleu": 10.255556843183474, "bleu_stderr": 0.14306801493502502, "rouge1_fmeasure": 0.40223710706751803, "rouge1_fmeasure_stderr": 0.0019113695305159313, "rouge1_precision": 0.40133894612714033, "rouge1_precision_stderr": 0.0022786254106147374, "rouge1_recall": 0.4353400028716086, "rouge1_recall_stderr": 0.0025934401772713216, "rouge2_fmeasure": 0.17238412451550736, "rouge2_fmeasure_stderr": 0.001647066730421544, "rouge2_precision": 0.17167615815004073, "rouge2_precision_stderr": 0.0017373085671162421, "rouge2_recall": 0.18892368452661648, "rouge2_recall_stderr": 0.002036271603133117, "rougeL_fmeasure": 0.2905126850575018, "rougeL_fmeasure_stderr": 0.0016066395148831222, "rougeL_precision": 0.2895776384246929, "rougeL_precision_stderr": 0.0018377899797485326, "rougeL_recall": 0.31561872620267456, "rougeL_recall_stderr": 0.002186944465790045, "rougeLsum_fmeasure": 0.3376761630795252, "rougeLsum_fmeasure_stderr": 0.001906552910299397, "rougeLsum_precision": 0.33661571162841014, "rougeLsum_precision_stderr": 0.0021532181771208945, "rougeLsum_recall": 0.36620748100065265, "rougeLsum_recall_stderr": 0.002517396529311699}}, "5": {"generate_text_restaurant": {"bleu": 10.453538794646896, "bleu_stderr": 0.1018770051872085, "rouge1_fmeasure": 0.40172960710680156, "rouge1_fmeasure_stderr": 0.0018233890583034939, "rouge1_precision": 0.40269265483237937, "rouge1_precision_stderr": 0.0022414598319616666, "rouge1_recall": 0.4321840404996293, "rouge1_recall_stderr": 0.002501669112723814, "rouge2_fmeasure": 0.17413364387783356, "rouge2_fmeasure_stderr": 0.001607923940126345, "rouge2_precision": 0.17489910198304812, "rouge2_precision_stderr": 0.001749578738044224, "rouge2_recall": 0.18884294973750174, "rouge2_recall_stderr": 0.001942308180411026, "rougeL_fmeasure": 0.29458902141806165, "rougeL_fmeasure_stderr": 0.0016001785328170475, "rougeL_precision": 0.29510908654175555, "rougeL_precision_stderr": 0.00188243407356544, "rougeL_recall": 0.31785854835047744, "rougeL_recall_stderr": 0.0021450790852441397, "rougeLsum_fmeasure": 0.3390409817775856, "rougeLsum_fmeasure_stderr": 0.0018684469365198332, "rougeLsum_precision": 0.3399297127876474, "rougeLsum_precision_stderr": 0.002185077308803786, "rougeLsum_recall": 0.364926897184195, "rougeLsum_recall_stderr": 0.00242487417897827}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 0.4630139659504365, "bleu_stderr": 0.07092608567266476, "rouge1_fmeasure": 0.11690451064634964, "rouge1_fmeasure_stderr": 0.001967787449479756, "rouge1_precision": 0.08488770249277669, "rouge1_precision_stderr": 
0.001509209602333853, "rouge1_recall": 0.19813172755834577, "rouge1_recall_stderr": 0.0032474572521749194, "rouge2_fmeasure": 0.01153407231838246, "rouge2_fmeasure_stderr": 0.0008078640211965759, "rouge2_precision": 0.008246438330541757, "rouge2_precision_stderr": 0.0005824935746807674, "rouge2_recall": 0.020228282417241533, "rouge2_recall_stderr": 0.0014323430166151545, "rougeL_fmeasure": 0.09705401381411084, "rougeL_fmeasure_stderr": 0.0014626510822599958, "rougeL_precision": 0.07026327535010932, "rougeL_precision_stderr": 0.0011182356154930892, "rougeL_recall": 0.16554056958926258, "rougeL_recall_stderr": 0.002477178559945266, "rougeLsum_fmeasure": 0.09684924566454005, "rougeLsum_fmeasure_stderr": 0.001564778913537185, "rougeLsum_precision": 0.0700549633031646, "rougeLsum_precision_stderr": 0.0011794948365267052, "rougeLsum_recall": 0.1654358655028746, "rougeLsum_recall_stderr": 0.002682749737328667}}, "1": {"article_DOC_summary": {"bleu": 0.7871536101776422, "bleu_stderr": 0.07322906159695927, "rouge1_fmeasure": 0.1288496551436097, "rouge1_fmeasure_stderr": 0.002308468910100682, "rouge1_precision": 0.09157070008924, "rouge1_precision_stderr": 0.0016830670326091262, "rouge1_recall": 0.22600669268446158, "rouge1_recall_stderr": 0.004011298506451288, "rouge2_fmeasure": 0.01870363630762986, "rouge2_fmeasure_stderr": 0.001060863444959382, "rouge2_precision": 0.013095108182729265, "rouge2_precision_stderr": 0.0007453038554987561, "rouge2_recall": 0.03410537737570771, "rouge2_recall_stderr": 0.001974452124311393, "rougeL_fmeasure": 0.10718165977213731, "rougeL_fmeasure_stderr": 0.0017615520710667569, "rougeL_precision": 0.07608507851006019, "rougeL_precision_stderr": 0.0012823580914152802, "rougeL_recall": 0.1887183804153633, "rougeL_recall_stderr": 0.0031215258317537436, "rougeLsum_fmeasure": 0.10668730515839585, "rougeLsum_fmeasure_stderr": 0.0018825214777256206, "rougeLsum_precision": 0.07570007632515688, "rougeLsum_precision_stderr": 0.0013649234652551023, "rougeLsum_recall": 0.18794434033398466, "rougeLsum_recall_stderr": 0.0033324412278060815}}, "2": {"article_DOC_summary": {"bleu": 0.8336246828929866, "bleu_stderr": 0.054821058679941914, "rouge1_fmeasure": 0.13584092927811475, "rouge1_fmeasure_stderr": 0.0023894507967195068, "rouge1_precision": 0.09606365410995298, "rouge1_precision_stderr": 0.0017289752531892755, "rouge1_recall": 0.24109076789716347, "rouge1_recall_stderr": 0.004249367408997067, "rouge2_fmeasure": 0.02149971321984745, "rouge2_fmeasure_stderr": 0.0011072351066159815, "rouge2_precision": 0.014965724288431435, "rouge2_precision_stderr": 0.0007708705714099285, "rouge2_recall": 0.03961712621671652, "rouge2_recall_stderr": 0.002080367490270864, "rougeL_fmeasure": 0.11064742747597277, "rougeL_fmeasure_stderr": 0.001786202252874528, "rougeL_precision": 0.07816557157605167, "rougeL_precision_stderr": 0.0012876620444083977, "rougeL_recall": 0.1971051484974429, "rougeL_recall_stderr": 0.003253803320603986, "rougeLsum_fmeasure": 0.11284377229419564, "rougeLsum_fmeasure_stderr": 0.0019429709645863045, "rougeLsum_precision": 0.07965005221915179, "rougeLsum_precision_stderr": 0.0013920378474243525, "rougeLsum_recall": 0.20132764801033967, "rougeLsum_recall_stderr": 0.0035528940991881495}}, "3": {"article_DOC_summary": {"bleu": 0.5212759581952935, "bleu_stderr": 0.05480106482512421, "rouge1_fmeasure": 0.10839918090666271, "rouge1_fmeasure_stderr": 0.002134922951671863, "rouge1_precision": 0.07905639335534033, "rouge1_precision_stderr": 0.0016483058479411423, "rouge1_recall": 
0.18619242699929947, "rouge1_recall_stderr": 0.003677925190637863, "rouge2_fmeasure": 0.012510492483954895, "rouge2_fmeasure_stderr": 0.0008578414376504718, "rouge2_precision": 0.00897105955837582, "rouge2_precision_stderr": 0.0006141464794685609, "rouge2_recall": 0.0220462775032213, "rouge2_recall_stderr": 0.00156983945630155, "rougeL_fmeasure": 0.09293093435452444, "rougeL_fmeasure_stderr": 0.0017059385683015748, "rougeL_precision": 0.06776336855389609, "rougeL_precision_stderr": 0.0013482428780685152, "rougeL_recall": 0.1599342621350528, "rougeL_recall_stderr": 0.002950049745796865, "rougeLsum_fmeasure": 0.09195844966907195, "rougeLsum_fmeasure_stderr": 0.001763679018061731, "rougeLsum_precision": 0.06698510588871204, "rougeLsum_precision_stderr": 0.0013678776526961345, "rougeLsum_recall": 0.15844453034703956, "rougeLsum_recall_stderr": 0.003088521487230944}}, "4": {"article_DOC_summary": {"bleu": 0.27307893333920474, "bleu_stderr": 0.07257669784005619, "rouge1_fmeasure": 0.029257582846789993, "rouge1_fmeasure_stderr": 0.0017353864694049216, "rouge1_precision": 0.024667231581309436, "rouge1_precision_stderr": 0.0016446798164940699, "rouge1_recall": 0.04578732040023698, "rouge1_recall_stderr": 0.0027949727542311478, "rouge2_fmeasure": 0.0029439626544144923, "rouge2_fmeasure_stderr": 0.00046267375464988764, "rouge2_precision": 0.0025778362583607642, "rouge2_precision_stderr": 0.0005426786000843293, "rouge2_recall": 0.005056626447538328, "rouge2_recall_stderr": 0.0008909529686166358, "rougeL_fmeasure": 0.025397669122061574, "rougeL_fmeasure_stderr": 0.001477824062133286, "rougeL_precision": 0.021633378868917415, "rougeL_precision_stderr": 0.0014766881089830067, "rougeL_recall": 0.039954198597200195, "rougeL_recall_stderr": 0.002427151773773013, "rougeLsum_fmeasure": 0.024937374055980524, "rougeLsum_fmeasure_stderr": 0.0014662690411890744, "rougeLsum_precision": 0.021302386042875585, "rougeLsum_precision_stderr": 0.0014719776290891184, "rougeLsum_recall": 0.039097440525313706, "rougeLsum_recall_stderr": 0.0023851152450321484}}, "5": {"article_DOC_summary": {"bleu": 8.502508359632142e-38, "bleu_stderr": 1.6224422585741e-33, "rouge1_fmeasure": 0.0015539306240835566, "rouge1_fmeasure_stderr": 0.00042422089610158747, "rouge1_precision": 0.0016784285108223424, "rouge1_precision_stderr": 0.00046933173483779724, "rouge1_recall": 0.001546401819187842, "rouge1_recall_stderr": 0.00043408961008404196, "rouge2_fmeasure": 5.19777535214928e-05, "rouge2_fmeasure_stderr": 5.1977753521493134e-05, "rouge2_precision": 5.04489960649783e-05, "rouge2_precision_stderr": 5.044899606497852e-05, "rouge2_recall": 5.360205831903945e-05, "rouge2_recall_stderr": 5.360205831903959e-05, "rougeL_fmeasure": 0.0014344496420688818, "rougeL_fmeasure_stderr": 0.0003937675400802755, "rougeL_precision": 0.0015267311823769563, "rougeL_precision_stderr": 0.00042072964022598367, "rougeL_recall": 0.001446281044627963, "rougeL_recall_stderr": 0.000414342893101565, "rougeLsum_fmeasure": 0.001442691159744362, "rougeLsum_fmeasure_stderr": 0.0003927017017637451, "rougeLsum_precision": 0.0015235781201228954, "rougeLsum_precision_stderr": 0.0004126564052488673, "rougeLsum_recall": 0.0014595891418657935, "rougeLsum_recall_stderr": 0.00041568179279564054}}}}
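merged.json holds the same numbers in nested form: dataset, then fewshot count (as a string key), then prompt, then metric. A minimal sketch that flattens it into rows, assuming the file sits locally under its repository path:

```python
import json

with open("4b284b12bc4seed2/evaluation/generation/merged.json") as f:
    merged = json.load(f)

# Nesting is dataset -> fewshots (string) -> prompt -> metric -> value.
rows = [
    (dataset, int(shots), prompt, metric, value)
    for dataset, by_shots in merged.items()
    for shots, by_prompt in by_shots.items()
    for prompt, metrics in by_prompt.items()
    for metric, value in metrics.items()
]
print(len(rows), "flattened (dataset, fewshots, prompt, metric, value) rows")
```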
4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "bleu": 0.5083902070671548,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.03989468844910082
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge1_precision": 0.07528694149270759,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.001485415514756619
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge1_recall": 0.3780318171066232,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.005223104661769293
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge1_fmeasure": 0.11728631797246779,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0019247654134608228
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge2_precision": 0.03516210005523054,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.0009572816724584412
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge2_recall": 0.18705373069728767,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.0037081482160342696
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge2_fmeasure": 0.05468830386197015,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.0012066786386101694
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeL_precision": 0.07002377028639689,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0013498282472237842
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeL_recall": 0.3530593907516912,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.00490724418597446
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeL_fmeasure": 0.10921255839870517,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0017509846379559321
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeLsum_precision": 0.07051076285503056,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.0014029195285359902
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeLsum_recall": 0.3526831574759125,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.004764676209885756
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeLsum_fmeasure": 0.10965304015302584,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.00179658805334725
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 4,
+ "batch_size": 16,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
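The slim.*.json files carry the same metric values as the agg files, pretty-printed and without the prompt template fields, again one metric per entry. A minimal sketch that collapses the `results` list into a flat metric dict, assuming the 4-shot WebNLG file above is available locally:

```python
import json

path = "4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.json"
with open(path) as f:
    slim = json.load(f)

# Collapse the one-metric-per-entry results list into {metric: value}.
skip = {"task_name", "prompt_name", "dataset_path", "dataset_name", "subset"}
metrics = {k: v for entry in slim["results"] for k, v in entry.items() if k not in skip}
print(metrics["rouge2_fmeasure"], metrics["rouge2_fmeasure_stderr"])
```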
4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "bleu": 0.525451157972169,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.020807029076595943
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge1_precision": 0.07578151398935791,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.0014658497562735056
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge1_recall": 0.3776088572433392,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.005222009047076881
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge1_fmeasure": 0.11815166842945922,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0019539933108443297
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge2_precision": 0.035216953195913794,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.0008936559179157622
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge2_recall": 0.18552571500246623,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.003651941896665819
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rouge2_fmeasure": 0.05505652232011781,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.0012284406115594482
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeL_precision": 0.07009792254358821,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0013082058038069526
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeL_recall": 0.35054744472531996,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.004791505363230279
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeL_fmeasure": 0.1094938648119312,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0017598889432284988
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeLsum_precision": 0.07078544810917187,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.0013654984516325997
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeLsum_recall": 0.35187230618740717,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.004734030010907423
+ },
+ {
+ "task_name": "GEM/web_nlg_en",
+ "prompt_name": "PALM_prompt",
+ "rougeLsum_fmeasure": 0.11026920423657147,
+ "dataset_path": "GEM/web_nlg",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.0018110860096435387
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 5,
+ "batch_size": 16,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.0073229441233485515,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.0007507591641053894
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.01188187354308511,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.001193460962416848
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.007782363958910315,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0007494140926600601
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.0014793812141488027,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.000229663938081861
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.002825244488953548,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.0005238543235168785
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.0016405694963126324,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.00023525203173163016
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.005727289612377416,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0005861714348780361
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.009446302855839223,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.0009620615699727216
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.006018804038099202,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0005623113681535245
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.00679434254221087,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.0006951873055254002
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.011060401872827133,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.0011163157558010933
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.007193842982164766,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.0006896239402426108
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 5.090669570430738e-07,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 8.489969077479997e-07
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 5,
+ "batch_size": 16,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "e2e_nlg_cleaned",
5
+ "prompt_name": "generate_text_restaurant",
6
+ "bleu": 10.255556843183474,
7
+ "dataset_path": "e2e_nlg_cleaned",
8
+ "dataset_name": null,
9
+ "subset": null,
10
+ "bleu_stderr": 0.14306801493502502
11
+ },
12
+ {
13
+ "task_name": "e2e_nlg_cleaned",
14
+ "prompt_name": "generate_text_restaurant",
15
+ "rouge1_precision": 0.40133894612714033,
16
+ "dataset_path": "e2e_nlg_cleaned",
17
+ "dataset_name": null,
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.0022786254106147374
20
+ },
21
+ {
22
+ "task_name": "e2e_nlg_cleaned",
23
+ "prompt_name": "generate_text_restaurant",
24
+ "rouge1_recall": 0.4353400028716086,
25
+ "dataset_path": "e2e_nlg_cleaned",
26
+ "dataset_name": null,
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.0025934401772713216
29
+ },
30
+ {
31
+ "task_name": "e2e_nlg_cleaned",
32
+ "prompt_name": "generate_text_restaurant",
33
+ "rouge1_fmeasure": 0.40223710706751803,
34
+ "dataset_path": "e2e_nlg_cleaned",
35
+ "dataset_name": null,
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.0019113695305159313
38
+ },
39
+ {
40
+ "task_name": "e2e_nlg_cleaned",
41
+ "prompt_name": "generate_text_restaurant",
42
+ "rouge2_precision": 0.17167615815004073,
43
+ "dataset_path": "e2e_nlg_cleaned",
44
+ "dataset_name": null,
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.0017373085671162421
47
+ },
48
+ {
49
+ "task_name": "e2e_nlg_cleaned",
50
+ "prompt_name": "generate_text_restaurant",
51
+ "rouge2_recall": 0.18892368452661648,
52
+ "dataset_path": "e2e_nlg_cleaned",
53
+ "dataset_name": null,
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.002036271603133117
56
+ },
57
+ {
58
+ "task_name": "e2e_nlg_cleaned",
59
+ "prompt_name": "generate_text_restaurant",
60
+ "rouge2_fmeasure": 0.17238412451550736,
61
+ "dataset_path": "e2e_nlg_cleaned",
62
+ "dataset_name": null,
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.001647066730421544
65
+ },
66
+ {
67
+ "task_name": "e2e_nlg_cleaned",
68
+ "prompt_name": "generate_text_restaurant",
69
+ "rougeL_precision": 0.2895776384246929,
70
+ "dataset_path": "e2e_nlg_cleaned",
71
+ "dataset_name": null,
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.0018377899797485326
74
+ },
75
+ {
76
+ "task_name": "e2e_nlg_cleaned",
77
+ "prompt_name": "generate_text_restaurant",
78
+ "rougeL_recall": 0.31561872620267456,
79
+ "dataset_path": "e2e_nlg_cleaned",
80
+ "dataset_name": null,
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.002186944465790045
83
+ },
84
+ {
85
+ "task_name": "e2e_nlg_cleaned",
86
+ "prompt_name": "generate_text_restaurant",
87
+ "rougeL_fmeasure": 0.2905126850575018,
88
+ "dataset_path": "e2e_nlg_cleaned",
89
+ "dataset_name": null,
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.0016066395148831222
92
+ },
93
+ {
94
+ "task_name": "e2e_nlg_cleaned",
95
+ "prompt_name": "generate_text_restaurant",
96
+ "rougeLsum_precision": 0.33661571162841014,
97
+ "dataset_path": "e2e_nlg_cleaned",
98
+ "dataset_name": null,
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.0021532181771208945
101
+ },
102
+ {
103
+ "task_name": "e2e_nlg_cleaned",
104
+ "prompt_name": "generate_text_restaurant",
105
+ "rougeLsum_recall": 0.36620748100065265,
106
+ "dataset_path": "e2e_nlg_cleaned",
107
+ "dataset_name": null,
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.002517396529311699
110
+ },
111
+ {
112
+ "task_name": "e2e_nlg_cleaned",
113
+ "prompt_name": "generate_text_restaurant",
114
+ "rougeLsum_fmeasure": 0.3376761630795252,
115
+ "dataset_path": "e2e_nlg_cleaned",
116
+ "dataset_name": null,
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.001906552910299397
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 4,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "e2e_nlg_cleaned",
5
+ "prompt_name": "generate_text_restaurant",
6
+ "bleu": 10.453538794646896,
7
+ "dataset_path": "e2e_nlg_cleaned",
8
+ "dataset_name": null,
9
+ "subset": null,
10
+ "bleu_stderr": 0.1018770051872085
11
+ },
12
+ {
13
+ "task_name": "e2e_nlg_cleaned",
14
+ "prompt_name": "generate_text_restaurant",
15
+ "rouge1_precision": 0.40269265483237937,
16
+ "dataset_path": "e2e_nlg_cleaned",
17
+ "dataset_name": null,
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.0022414598319616666
20
+ },
21
+ {
22
+ "task_name": "e2e_nlg_cleaned",
23
+ "prompt_name": "generate_text_restaurant",
24
+ "rouge1_recall": 0.4321840404996293,
25
+ "dataset_path": "e2e_nlg_cleaned",
26
+ "dataset_name": null,
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.002501669112723814
29
+ },
30
+ {
31
+ "task_name": "e2e_nlg_cleaned",
32
+ "prompt_name": "generate_text_restaurant",
33
+ "rouge1_fmeasure": 0.40172960710680156,
34
+ "dataset_path": "e2e_nlg_cleaned",
35
+ "dataset_name": null,
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.0018233890583034939
38
+ },
39
+ {
40
+ "task_name": "e2e_nlg_cleaned",
41
+ "prompt_name": "generate_text_restaurant",
42
+ "rouge2_precision": 0.17489910198304812,
43
+ "dataset_path": "e2e_nlg_cleaned",
44
+ "dataset_name": null,
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.001749578738044224
47
+ },
48
+ {
49
+ "task_name": "e2e_nlg_cleaned",
50
+ "prompt_name": "generate_text_restaurant",
51
+ "rouge2_recall": 0.18884294973750174,
52
+ "dataset_path": "e2e_nlg_cleaned",
53
+ "dataset_name": null,
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.001942308180411026
56
+ },
57
+ {
58
+ "task_name": "e2e_nlg_cleaned",
59
+ "prompt_name": "generate_text_restaurant",
60
+ "rouge2_fmeasure": 0.17413364387783356,
61
+ "dataset_path": "e2e_nlg_cleaned",
62
+ "dataset_name": null,
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.001607923940126345
65
+ },
66
+ {
67
+ "task_name": "e2e_nlg_cleaned",
68
+ "prompt_name": "generate_text_restaurant",
69
+ "rougeL_precision": 0.29510908654175555,
70
+ "dataset_path": "e2e_nlg_cleaned",
71
+ "dataset_name": null,
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.00188243407356544
74
+ },
75
+ {
76
+ "task_name": "e2e_nlg_cleaned",
77
+ "prompt_name": "generate_text_restaurant",
78
+ "rougeL_recall": 0.31785854835047744,
79
+ "dataset_path": "e2e_nlg_cleaned",
80
+ "dataset_name": null,
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.0021450790852441397
83
+ },
84
+ {
85
+ "task_name": "e2e_nlg_cleaned",
86
+ "prompt_name": "generate_text_restaurant",
87
+ "rougeL_fmeasure": 0.29458902141806165,
88
+ "dataset_path": "e2e_nlg_cleaned",
89
+ "dataset_name": null,
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.0016001785328170475
92
+ },
93
+ {
94
+ "task_name": "e2e_nlg_cleaned",
95
+ "prompt_name": "generate_text_restaurant",
96
+ "rougeLsum_precision": 0.3399297127876474,
97
+ "dataset_path": "e2e_nlg_cleaned",
98
+ "dataset_name": null,
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.002185077308803786
101
+ },
102
+ {
103
+ "task_name": "e2e_nlg_cleaned",
104
+ "prompt_name": "generate_text_restaurant",
105
+ "rougeLsum_recall": 0.364926897184195,
106
+ "dataset_path": "e2e_nlg_cleaned",
107
+ "dataset_name": null,
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.00242487417897827
110
+ },
111
+ {
112
+ "task_name": "e2e_nlg_cleaned",
113
+ "prompt_name": "generate_text_restaurant",
114
+ "rougeLsum_fmeasure": 0.3390409817775856,
115
+ "dataset_path": "e2e_nlg_cleaned",
116
+ "dataset_name": null,
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.0018684469365198332
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 5,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.024667231581309436,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0016446798164940699
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.04578732040023698,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.0027949727542311478
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.029257582846789993,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.0017353864694049216
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.0025778362583607642,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.0005426786000843293
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.005056626447538328,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.0008909529686166358
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.0029439626544144923,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.00046267375464988764
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.021633378868917415,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.0014766881089830067
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.039954198597200195,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.002427151773773013
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.025397669122061574,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.001477824062133286
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.021302386042875585,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.0014719776290891184
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.039097440525313706,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.0023851152450321484
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.024937374055980524,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.0014662690411890744
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 0.27307893333920474,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 0.07257669784005619
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 4,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.0016784285108223424,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.00046933173483779724
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.001546401819187842,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.00043408961008404196
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.0015539306240835566,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.00042422089610158747
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 5.04489960649783e-05,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 5.044899606497852e-05
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 5.360205831903945e-05,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 5.360205831903959e-05
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 5.19777535214928e-05,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 5.1977753521493134e-05
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.0015267311823769563,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.00042072964022598367
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.001446281044627963,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.000414342893101565
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.0014344496420688818,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.0003937675400802755
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.0015235781201228954,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.0004126564052488673
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.0014595891418657935,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.00041568179279564054
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.001442691159744362,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.0003927017017637451
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 8.502508359632142e-38,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 1.6224422585741e-33
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 5,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
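The "model_args" value in each config block packs the loader options into comma-separated key=value pairs. How the evaluation harness itself consumes this string is not shown in this commit; the sketch below is only an illustration of how such a string splits into keyword arguments.

def parse_model_args(model_args: str) -> dict:
    """Split a comma-separated key=value string (as stored under
    config["model_args"] above) into a plain dict.

    Assumes no value contains a comma, which holds for these records.
    """
    kwargs = {}
    for pair in model_args.split(","):
        key, _, value = pair.partition("=")
        kwargs[key.strip()] = value
    return kwargs

# e.g. parse_model_args(config["model_args"])["dtype"] -> "bfloat16"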
4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_0.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.342,0.01500870618212173,0
+ anli_r2,acc,0.343,0.015019206922356953,0
+ anli_r3,acc,0.3491666666666667,0.013767075395077249,0
+ arc_challenge,acc,0.2713310580204778,0.012993807727545797,0
+ arc_challenge,acc_norm,0.2960750853242321,0.013340916085246268,0
+ arc_easy,acc,0.5526094276094277,0.010202832385415644,0
+ arc_easy,acc_norm,0.5004208754208754,0.010259779886094424,0
+ boolq,acc,0.5770642201834862,0.008640558744656426,1
+ cb,acc,0.3392857142857143,0.06384226561930825,1
+ cb,f1,0.24338624338624337,,1
+ copa,acc,0.78,0.04163331998932261,0
+ hellaswag,acc,0.4737104162517427,0.004982879340691403,0
+ hellaswag,acc_norm,0.616211909978092,0.004853134271547751,0
+ piqa,acc,0.7393906420021763,0.010241826155811625,0
+ piqa,acc_norm,0.749183895538629,0.010113869547069046,0
+ rte,acc,0.555956678700361,0.029907396333795994,0
+ sciq,acc,0.84,0.011598902298689005,0
+ sciq,acc_norm,0.758,0.013550631705555954,0
+ storycloze_2016,acc,0.7076429716729022,0.010518239729787743,0
+ winogrande,acc,0.5943172849250198,0.01380020633601421,0
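The rankeval CSVs added here are flat versions of the *_backup.json files deleted below, which hold the same numbers nested under "results" and "versions" keyed by task. The conversion script itself is not part of this commit; a minimal sketch of the mapping, assuming it is done with the standard json and csv modules:

import csv
import json

def backup_to_rows(backup_path: str):
    # Flatten one backup-style JSON into rows matching the
    # task,metric,value,err,version layout of the rankeval CSVs above.
    with open(backup_path) as f:
        data = json.load(f)
    rows = []
    for task, metrics in sorted(data["results"].items()):
        version = data["versions"][task]
        for metric, value in metrics.items():
            if metric.endswith("_stderr"):
                continue
            err = metrics.get(metric + "_stderr", "")  # e.g. cb f1 has no stderr
            rows.append([task, metric, value, err, version])
    return rows

def write_csv(rows, out_path: str):
    with open(out_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["task", "metric", "value", "err", "version"])
        writer.writerows(rows)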
4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_0_lm-eval_global_step80108_2023-02-24-15-37-27_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.342,
5
- "acc_stderr": 0.01500870618212173
6
- },
7
- "anli_r2": {
8
- "acc": 0.343,
9
- "acc_stderr": 0.015019206922356953
10
- },
11
- "anli_r3": {
12
- "acc": 0.3491666666666667,
13
- "acc_stderr": 0.013767075395077249
14
- },
15
- "cb": {
16
- "acc": 0.3392857142857143,
17
- "acc_stderr": 0.06384226561930825,
18
- "f1": 0.24338624338624337
19
- },
20
- "copa": {
21
- "acc": 0.78,
22
- "acc_stderr": 0.04163331998932261
23
- },
24
- "hellaswag": {
25
- "acc": 0.4737104162517427,
26
- "acc_stderr": 0.004982879340691403,
27
- "acc_norm": 0.616211909978092,
28
- "acc_norm_stderr": 0.004853134271547751
29
- },
30
- "rte": {
31
- "acc": 0.555956678700361,
32
- "acc_stderr": 0.029907396333795994
33
- },
34
- "winogrande": {
35
- "acc": 0.5943172849250198,
36
- "acc_stderr": 0.01380020633601421
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7076429716729022,
40
- "acc_stderr": 0.010518239729787743
41
- },
42
- "boolq": {
43
- "acc": 0.5770642201834862,
44
- "acc_stderr": 0.008640558744656426
45
- },
46
- "arc_easy": {
47
- "acc": 0.5526094276094277,
48
- "acc_stderr": 0.010202832385415644,
49
- "acc_norm": 0.5004208754208754,
50
- "acc_norm_stderr": 0.010259779886094424
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2713310580204778,
54
- "acc_stderr": 0.012993807727545797,
55
- "acc_norm": 0.2960750853242321,
56
- "acc_norm_stderr": 0.013340916085246268
57
- },
58
- "sciq": {
59
- "acc": 0.84,
60
- "acc_stderr": 0.011598902298689005,
61
- "acc_norm": 0.758,
62
- "acc_norm_stderr": 0.013550631705555954
63
- },
64
- "piqa": {
65
- "acc": 0.7393906420021763,
66
- "acc_stderr": 0.010241826155811625,
67
- "acc_norm": 0.749183895538629,
68
- "acc_norm_stderr": 0.010113869547069046
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_1.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.34,0.014987482264363937,0
+ anli_r2,acc,0.325,0.014818724459095527,0
+ anli_r3,acc,0.35583333333333333,0.013826518748493314,0
+ arc_challenge,acc,0.2687713310580205,0.012955065963710691,0
+ arc_challenge,acc_norm,0.29948805460750855,0.013385021637313574,0
+ arc_easy,acc,0.577020202020202,0.010137328382209094,0
+ arc_easy,acc_norm,0.5315656565656566,0.010239317603199507,0
+ boolq,acc,0.598776758409786,0.008572708337178997,1
+ cb,acc,0.48214285714285715,0.0673769750864465,1
+ cb,f1,0.40095238095238095,,1
+ copa,acc,0.8,0.040201512610368445,0
+ hellaswag,acc,0.4733120892252539,0.00498266845211894,0
+ hellaswag,acc_norm,0.6218880701055567,0.0048392473326060465,0
+ piqa,acc,0.7578890097932536,0.009994371269104381,0
+ piqa,acc_norm,0.7633297062023939,0.009916841655042809,0
+ rte,acc,0.592057761732852,0.029581952519606193,0
+ sciq,acc,0.837,0.011686212712746839,0
+ sciq,acc_norm,0.788,0.012931481864938034,0
+ storycloze_2016,acc,0.7012292891501871,0.010584692134739969,0
+ winogrande,acc,0.580110497237569,0.013870943986310393,0
4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_1_lm-eval_global_step80108_2023-02-24-15-37-27_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.34,
5
- "acc_stderr": 0.014987482264363937
6
- },
7
- "anli_r2": {
8
- "acc": 0.325,
9
- "acc_stderr": 0.014818724459095527
10
- },
11
- "anli_r3": {
12
- "acc": 0.35583333333333333,
13
- "acc_stderr": 0.013826518748493314
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.0673769750864465,
18
- "f1": 0.40095238095238095
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.040201512610368445
23
- },
24
- "hellaswag": {
25
- "acc": 0.4733120892252539,
26
- "acc_stderr": 0.00498266845211894,
27
- "acc_norm": 0.6218880701055567,
28
- "acc_norm_stderr": 0.0048392473326060465
29
- },
30
- "rte": {
31
- "acc": 0.592057761732852,
32
- "acc_stderr": 0.029581952519606193
33
- },
34
- "winogrande": {
35
- "acc": 0.580110497237569,
36
- "acc_stderr": 0.013870943986310393
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7012292891501871,
40
- "acc_stderr": 0.010584692134739969
41
- },
42
- "boolq": {
43
- "acc": 0.598776758409786,
44
- "acc_stderr": 0.008572708337178997
45
- },
46
- "arc_easy": {
47
- "acc": 0.577020202020202,
48
- "acc_stderr": 0.010137328382209094,
49
- "acc_norm": 0.5315656565656566,
50
- "acc_norm_stderr": 0.010239317603199507
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2687713310580205,
54
- "acc_stderr": 0.012955065963710691,
55
- "acc_norm": 0.29948805460750855,
56
- "acc_norm_stderr": 0.013385021637313574
57
- },
58
- "sciq": {
59
- "acc": 0.837,
60
- "acc_stderr": 0.011686212712746839,
61
- "acc_norm": 0.788,
62
- "acc_norm_stderr": 0.012931481864938034
63
- },
64
- "piqa": {
65
- "acc": 0.7578890097932536,
66
- "acc_stderr": 0.009994371269104381,
67
- "acc_norm": 0.7633297062023939,
68
- "acc_norm_stderr": 0.009916841655042809
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_2.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.342,0.015008706182121728,0
+ anli_r2,acc,0.323,0.014794927843348637,0
+ anli_r3,acc,0.3358333333333333,0.013639261190932882,0
+ arc_challenge,acc,0.2619453924914676,0.012849054826858112,0
+ arc_challenge,acc_norm,0.30716723549488056,0.013481034054980943,0
+ arc_easy,acc,0.5837542087542088,0.010114819404500867,0
+ arc_easy,acc_norm,0.5521885521885522,0.010203742451111525,0
+ boolq,acc,0.6,0.008568368985904962,1
+ cb,acc,0.375,0.06527912098338669,1
+ cb,f1,0.26656990807934206,,1
+ copa,acc,0.79,0.040936018074033256,0
+ hellaswag,acc,0.4715196176060546,0.004981680090303701,0
+ hellaswag,acc_norm,0.6190997809201354,0.004846156699486671,0
+ piqa,acc,0.7470076169749728,0.01014288869886246,0
+ piqa,acc_norm,0.7573449401523396,0.01000200256970869,0
+ rte,acc,0.5523465703971119,0.029931070362939526,0
+ sciq,acc,0.846,0.011419913065098708,0
+ sciq,acc_norm,0.806,0.012510816141264368,0
+ storycloze_2016,acc,0.703901656867985,0.010557307688475123,0
+ winogrande,acc,0.5753749013417522,0.013891893150264224,0
4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_2_lm-eval_global_step80108_2023-02-24-15-37-27_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.342,
5
- "acc_stderr": 0.015008706182121728
6
- },
7
- "anli_r2": {
8
- "acc": 0.323,
9
- "acc_stderr": 0.014794927843348637
10
- },
11
- "anli_r3": {
12
- "acc": 0.3358333333333333,
13
- "acc_stderr": 0.013639261190932882
14
- },
15
- "cb": {
16
- "acc": 0.375,
17
- "acc_stderr": 0.06527912098338669,
18
- "f1": 0.26656990807934206
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.4715196176060546,
26
- "acc_stderr": 0.004981680090303701,
27
- "acc_norm": 0.6190997809201354,
28
- "acc_norm_stderr": 0.004846156699486671
29
- },
30
- "rte": {
31
- "acc": 0.5523465703971119,
32
- "acc_stderr": 0.029931070362939526
33
- },
34
- "winogrande": {
35
- "acc": 0.5753749013417522,
36
- "acc_stderr": 0.013891893150264224
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.703901656867985,
40
- "acc_stderr": 0.010557307688475123
41
- },
42
- "boolq": {
43
- "acc": 0.6,
44
- "acc_stderr": 0.008568368985904962
45
- },
46
- "arc_easy": {
47
- "acc": 0.5837542087542088,
48
- "acc_stderr": 0.010114819404500867,
49
- "acc_norm": 0.5521885521885522,
50
- "acc_norm_stderr": 0.010203742451111525
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2619453924914676,
54
- "acc_stderr": 0.012849054826858112,
55
- "acc_norm": 0.30716723549488056,
56
- "acc_norm_stderr": 0.013481034054980943
57
- },
58
- "sciq": {
59
- "acc": 0.846,
60
- "acc_stderr": 0.011419913065098708,
61
- "acc_norm": 0.806,
62
- "acc_norm_stderr": 0.012510816141264368
63
- },
64
- "piqa": {
65
- "acc": 0.7470076169749728,
66
- "acc_stderr": 0.01014288869886246,
67
- "acc_norm": 0.7573449401523396,
68
- "acc_norm_stderr": 0.01000200256970869
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_3.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.333,0.014910846164229875,0
+ anli_r2,acc,0.331,0.014888272588203936,0
+ anli_r3,acc,0.35583333333333333,0.013826518748493324,0
+ arc_challenge,acc,0.26791808873720135,0.012942030195136442,0
+ arc_challenge,acc_norm,0.31313993174061433,0.013552671543623501,0
+ arc_easy,acc,0.5900673400673401,0.010091953527506246,0
+ arc_easy,acc_norm,0.5627104377104377,0.01017876842932159,0
+ boolq,acc,0.6070336391437309,0.00854233514797057,1
+ cb,acc,0.4107142857142857,0.0663363415035954,1
+ cb,f1,0.35664983164983166,,1
+ copa,acc,0.83,0.03775251680686371,0
+ hellaswag,acc,0.47032463652658835,0.004980985384152898,0
+ hellaswag,acc_norm,0.6199960167297351,0.004843954338451451,0
+ piqa,acc,0.7524483133841132,0.010069703966857108,0
+ piqa,acc_norm,0.7616974972796517,0.009940334245876224,0
+ rte,acc,0.5740072202166066,0.029764956741777645,0
+ sciq,acc,0.853,0.011203415395160331,0
+ sciq,acc_norm,0.813,0.01233625482807413,0
+ storycloze_2016,acc,0.7055050774986639,0.010540668963800296,0
+ winogrande,acc,0.5769534333070244,0.013885055359056472,0
4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_3_lm-eval_global_step80108_2023-02-24-15-37-27_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.333,
5
- "acc_stderr": 0.014910846164229875
6
- },
7
- "anli_r2": {
8
- "acc": 0.331,
9
- "acc_stderr": 0.014888272588203936
10
- },
11
- "anli_r3": {
12
- "acc": 0.35583333333333333,
13
- "acc_stderr": 0.013826518748493324
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.0663363415035954,
18
- "f1": 0.35664983164983166
19
- },
20
- "copa": {
21
- "acc": 0.83,
22
- "acc_stderr": 0.03775251680686371
23
- },
24
- "hellaswag": {
25
- "acc": 0.47032463652658835,
26
- "acc_stderr": 0.004980985384152898,
27
- "acc_norm": 0.6199960167297351,
28
- "acc_norm_stderr": 0.004843954338451451
29
- },
30
- "rte": {
31
- "acc": 0.5740072202166066,
32
- "acc_stderr": 0.029764956741777645
33
- },
34
- "winogrande": {
35
- "acc": 0.5769534333070244,
36
- "acc_stderr": 0.013885055359056472
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7055050774986639,
40
- "acc_stderr": 0.010540668963800296
41
- },
42
- "boolq": {
43
- "acc": 0.6070336391437309,
44
- "acc_stderr": 0.00854233514797057
45
- },
46
- "arc_easy": {
47
- "acc": 0.5900673400673401,
48
- "acc_stderr": 0.010091953527506246,
49
- "acc_norm": 0.5627104377104377,
50
- "acc_norm_stderr": 0.01017876842932159
51
- },
52
- "arc_challenge": {
53
- "acc": 0.26791808873720135,
54
- "acc_stderr": 0.012942030195136442,
55
- "acc_norm": 0.31313993174061433,
56
- "acc_norm_stderr": 0.013552671543623501
57
- },
58
- "sciq": {
59
- "acc": 0.853,
60
- "acc_stderr": 0.011203415395160331,
61
- "acc_norm": 0.813,
62
- "acc_norm_stderr": 0.01233625482807413
63
- },
64
- "piqa": {
65
- "acc": 0.7524483133841132,
66
- "acc_stderr": 0.010069703966857108,
67
- "acc_norm": 0.7616974972796517,
68
- "acc_norm_stderr": 0.009940334245876224
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_4.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.34,0.014987482264363935,0
+ anli_r2,acc,0.337,0.014955087918653603,0
+ anli_r3,acc,0.35,0.013774667009018552,0
+ arc_challenge,acc,0.27474402730375425,0.013044617212771227,0
+ arc_challenge,acc_norm,0.30802047781569963,0.01349142951729204,0
+ arc_easy,acc,0.593013468013468,0.010080695355466598,0
+ arc_easy,acc_norm,0.5568181818181818,0.010193324837773497,0
+ boolq,acc,0.5975535168195719,0.008576992126012484,1
+ cb,acc,0.4107142857142857,0.06633634150359541,1
+ cb,f1,0.29090909090909095,,1
+ copa,acc,0.8,0.040201512610368445,0
+ hellaswag,acc,0.4698267277434774,0.004980687467486101,0
+ hellaswag,acc_norm,0.6165106552479586,0.004852420856631488,0
+ piqa,acc,0.7513601741022851,0.010084511234296857,0
+ piqa,acc_norm,0.76550598476605,0.009885203143240536,0
+ rte,acc,0.5234657039711191,0.03006330041190266,0
+ sciq,acc,0.855,0.011139977517890132,0
+ sciq,acc_norm,0.814,0.0123107902084128,0
+ storycloze_2016,acc,0.7108498129342598,0.010484068799942061,0
+ winogrande,acc,0.5864246250986582,0.013840971763195303,0
4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_4_lm-eval_global_step80108_2023-02-24-15-37-27_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.34,
5
- "acc_stderr": 0.014987482264363935
6
- },
7
- "anli_r2": {
8
- "acc": 0.337,
9
- "acc_stderr": 0.014955087918653603
10
- },
11
- "anli_r3": {
12
- "acc": 0.35,
13
- "acc_stderr": 0.013774667009018552
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.06633634150359541,
18
- "f1": 0.29090909090909095
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.040201512610368445
23
- },
24
- "hellaswag": {
25
- "acc": 0.4698267277434774,
26
- "acc_stderr": 0.004980687467486101,
27
- "acc_norm": 0.6165106552479586,
28
- "acc_norm_stderr": 0.004852420856631488
29
- },
30
- "rte": {
31
- "acc": 0.5234657039711191,
32
- "acc_stderr": 0.03006330041190266
33
- },
34
- "winogrande": {
35
- "acc": 0.5864246250986582,
36
- "acc_stderr": 0.013840971763195303
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7108498129342598,
40
- "acc_stderr": 0.010484068799942061
41
- },
42
- "boolq": {
43
- "acc": 0.5975535168195719,
44
- "acc_stderr": 0.008576992126012484
45
- },
46
- "arc_easy": {
47
- "acc": 0.593013468013468,
48
- "acc_stderr": 0.010080695355466598,
49
- "acc_norm": 0.5568181818181818,
50
- "acc_norm_stderr": 0.010193324837773497
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27474402730375425,
54
- "acc_stderr": 0.013044617212771227,
55
- "acc_norm": 0.30802047781569963,
56
- "acc_norm_stderr": 0.01349142951729204
57
- },
58
- "sciq": {
59
- "acc": 0.855,
60
- "acc_stderr": 0.011139977517890132,
61
- "acc_norm": 0.814,
62
- "acc_norm_stderr": 0.0123107902084128
63
- },
64
- "piqa": {
65
- "acc": 0.7513601741022851,
66
- "acc_stderr": 0.010084511234296857,
67
- "acc_norm": 0.76550598476605,
68
- "acc_norm_stderr": 0.009885203143240536
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_5.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.353,0.015120172605483692,0
+ anli_r2,acc,0.315,0.014696631960792503,0
+ anli_r3,acc,0.3425,0.013704669762934727,0
+ arc_challenge,acc,0.2858361774744027,0.013203196088537369,0
+ arc_challenge,acc_norm,0.29948805460750855,0.013385021637313572,0
+ arc_easy,acc,0.5921717171717171,0.010083950240041214,0
+ arc_easy,acc_norm,0.5580808080808081,0.010190328123071765,0
+ boolq,acc,0.6067278287461774,0.00854350553741787,1
+ cb,acc,0.4642857142857143,0.06724777654937658,1
+ cb,f1,0.32716049382716045,,1
+ copa,acc,0.82,0.038612291966536955,0
+ hellaswag,acc,0.46703843855805616,0.004978927164792884,0
+ hellaswag,acc_norm,0.6155148376817368,0.004854791378657001,0
+ piqa,acc,0.749183895538629,0.010113869547069044,0
+ piqa,acc_norm,0.7584330794341676,0.009986718001804448,0
+ rte,acc,0.5740072202166066,0.02976495674177765,0
+ sciq,acc,0.865,0.010811655372416051,0
+ sciq,acc_norm,0.834,0.011772110370812189,0
+ storycloze_2016,acc,0.7076429716729022,0.01051823972978774,0
+ winogrande,acc,0.5643251775848461,0.013935709739615713,0
4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_5_lm-eval_global_step80108_2023-02-24-15-37-27_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.353,
5
- "acc_stderr": 0.015120172605483692
6
- },
7
- "anli_r2": {
8
- "acc": 0.315,
9
- "acc_stderr": 0.014696631960792503
10
- },
11
- "anli_r3": {
12
- "acc": 0.3425,
13
- "acc_stderr": 0.013704669762934727
14
- },
15
- "cb": {
16
- "acc": 0.4642857142857143,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.32716049382716045
19
- },
20
- "copa": {
21
- "acc": 0.82,
22
- "acc_stderr": 0.038612291966536955
23
- },
24
- "hellaswag": {
25
- "acc": 0.46703843855805616,
26
- "acc_stderr": 0.004978927164792884,
27
- "acc_norm": 0.6155148376817368,
28
- "acc_norm_stderr": 0.004854791378657001
29
- },
30
- "rte": {
31
- "acc": 0.5740072202166066,
32
- "acc_stderr": 0.02976495674177765
33
- },
34
- "winogrande": {
35
- "acc": 0.5643251775848461,
36
- "acc_stderr": 0.013935709739615713
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7076429716729022,
40
- "acc_stderr": 0.01051823972978774
41
- },
42
- "boolq": {
43
- "acc": 0.6067278287461774,
44
- "acc_stderr": 0.00854350553741787
45
- },
46
- "arc_easy": {
47
- "acc": 0.5921717171717171,
48
- "acc_stderr": 0.010083950240041214,
49
- "acc_norm": 0.5580808080808081,
50
- "acc_norm_stderr": 0.010190328123071765
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2858361774744027,
54
- "acc_stderr": 0.013203196088537369,
55
- "acc_norm": 0.29948805460750855,
56
- "acc_norm_stderr": 0.013385021637313572
57
- },
58
- "sciq": {
59
- "acc": 0.865,
60
- "acc_stderr": 0.010811655372416051,
61
- "acc_norm": 0.834,
62
- "acc_norm_stderr": 0.011772110370812189
63
- },
64
- "piqa": {
65
- "acc": 0.749183895538629,
66
- "acc_stderr": 0.010113869547069044,
67
- "acc_norm": 0.7584330794341676,
68
- "acc_norm_stderr": 0.009986718001804448
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }