Muennighoff
commited on
Commit
•
a706068
1
Parent(s):
ea88016
Add files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- 8b7178b25b/evaluation/generation/merged.csv +39 -0
- 8b7178b25b/evaluation/generation/merged.json +1 -0
- 8b7178b25b/evaluation/rankeval/8b7178b25b_0.csv +21 -0
- 8b7178b25b/evaluation/rankeval/8b7178b25b_0_lm-eval_global_step84877_2023-01-30-20-00-12_0shots_backup.json +0 -87
- 8b7178b25b/evaluation/rankeval/8b7178b25b_1.csv +21 -0
- 8b7178b25b/evaluation/rankeval/8b7178b25b_1_lm-eval_global_step84877_2023-01-30-20-00-12_1shots_backup.json +0 -87
- 8b7178b25b/evaluation/rankeval/8b7178b25b_2.csv +21 -0
- 8b7178b25b/evaluation/rankeval/8b7178b25b_2_lm-eval_global_step84877_2023-01-30-20-00-12_2shots_backup.json +0 -87
- 8b7178b25b/evaluation/rankeval/8b7178b25b_3.csv +21 -0
- 8b7178b25b/evaluation/rankeval/8b7178b25b_3_lm-eval_global_step84877_2023-01-30-20-00-12_3shots_backup.json +0 -87
- 8b7178b25b/evaluation/rankeval/8b7178b25b_4.csv +21 -0
- 8b7178b25b/evaluation/rankeval/8b7178b25b_4_lm-eval_global_step84877_2023-01-30-20-00-12_4shots_backup.json +0 -87
- 8b7178b25b/evaluation/rankeval/8b7178b25b_5.csv +21 -0
- 8b7178b25b/evaluation/rankeval/8b7178b25b_5_lm-eval_global_step84877_2023-01-30-20-00-12_5shots_backup.json +0 -87
- 8b7178b35b/evaluation/generation/merged.csv +39 -0
- 8b7178b35b/evaluation/generation/merged.json +1 -0
- 8b7178b35b/evaluation/rankeval/8b7178b35b_0.csv +21 -0
- 8b7178b35b/evaluation/rankeval/8b7178b35b_0_lm-eval_global_step84877_2023-01-30-20-00-09_0shots_backup.json +0 -87
- 8b7178b35b/evaluation/rankeval/8b7178b35b_1.csv +21 -0
- 8b7178b35b/evaluation/rankeval/8b7178b35b_1_lm-eval_global_step84877_2023-01-30-20-00-09_1shots_backup.json +0 -87
- 8b7178b35b/evaluation/rankeval/8b7178b35b_2.csv +21 -0
- 8b7178b35b/evaluation/rankeval/8b7178b35b_2_lm-eval_global_step84877_2023-01-30-20-00-09_2shots_backup.json +0 -87
- 8b7178b35b/evaluation/rankeval/8b7178b35b_3.csv +21 -0
- 8b7178b35b/evaluation/rankeval/8b7178b35b_3_lm-eval_global_step84877_2023-01-30-20-00-09_3shots_backup.json +0 -87
- 8b7178b35b/evaluation/rankeval/8b7178b35b_4.csv +21 -0
- 8b7178b35b/evaluation/rankeval/8b7178b35b_4_lm-eval_global_step84877_2023-01-30-20-00-09_4shots_backup.json +0 -87
- 8b7178b35b/evaluation/rankeval/8b7178b35b_5.csv +21 -0
- 8b7178b35b/evaluation/rankeval/8b7178b35b_5_lm-eval_global_step84877_2023-01-30-20-00-09_5shots_backup.json +0 -87
- 8b7178b44b/evaluation/generation/merged.csv +39 -0
- 8b7178b44b/evaluation/generation/merged.json +1 -0
- 8b7178b44b/evaluation/rankeval/8b7178b44b_0.csv +21 -0
- 8b7178b44b/evaluation/rankeval/8b7178b44b_0_lm-eval_global_step84877_2023-01-31-11-38-06_0shots_backup.json +0 -87
- 8b7178b44b/evaluation/rankeval/8b7178b44b_1.csv +21 -0
- 8b7178b44b/evaluation/rankeval/8b7178b44b_1_lm-eval_global_step84877_2023-01-31-11-38-06_1shots_backup.json +0 -87
- 8b7178b44b/evaluation/rankeval/8b7178b44b_2.csv +21 -0
- 8b7178b44b/evaluation/rankeval/8b7178b44b_2_lm-eval_global_step84877_2023-01-31-11-38-06_2shots_backup.json +0 -87
- 8b7178b44b/evaluation/rankeval/8b7178b44b_3.csv +21 -0
- 8b7178b44b/evaluation/rankeval/8b7178b44b_3_lm-eval_global_step84877_2023-01-31-11-38-06_3shots_backup.json +0 -87
- 8b7178b44b/evaluation/rankeval/8b7178b44b_4.csv +21 -0
- 8b7178b44b/evaluation/rankeval/8b7178b44b_4_lm-eval_global_step84877_2023-01-31-11-38-06_4shots_backup.json +0 -87
- 8b7178b44b/evaluation/rankeval/8b7178b44b_5.csv +21 -0
- 8b7178b44b/evaluation/rankeval/8b7178b44b_5.json +22 -1
- 8b7178b44b/evaluation/rankeval/8b7178b44b_5_lm-eval_global_step84877_2023-01-31-11-38-06_5shots_backup.json +0 -66
- 8b7178b58b/evaluation/generation/merged.csv +39 -0
- 8b7178b58b/evaluation/generation/merged.json +1 -0
- 8b7178b58b/evaluation/rankeval/8b7178b58b_0.csv +21 -0
- 8b7178b58b/evaluation/rankeval/8b7178b58b_0_lm-eval_global_step84877_2023-01-31-11-38-06_0shots_backup.json +0 -87
- 8b7178b58b/evaluation/rankeval/8b7178b58b_1.csv +21 -0
- 8b7178b58b/evaluation/rankeval/8b7178b58b_1_lm-eval_global_step84877_2023-01-31-11-38-06_1shots_backup.json +0 -87
- 8b7178b58b/evaluation/rankeval/8b7178b58b_2.csv +21 -0
8b7178b25b/evaluation/generation/merged.csv
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0015194719382464828
|
3 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0015194719382464828
|
4 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.18637115332137155
|
5 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.18637115332137155
|
6 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.21309279835498804
|
7 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.21309279835498804
|
8 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.22032333042194527
|
9 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.22032333042194527
|
10 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.21993594978939215
|
11 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.21993594978939215
|
12 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.21793438022656353
|
13 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.21793438022656353
|
14 |
+
e2e_nlg_cleaned,5,average,multiple,0.17652951400875116
|
15 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.056442306230818876
|
16 |
+
gem_xsum,0,median,rouge2_fmeasure,0.056442306230818876
|
17 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03598895109620969
|
18 |
+
gem_xsum,1,median,rouge2_fmeasure,0.03598895109620969
|
19 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03728951183116744
|
20 |
+
gem_xsum,2,median,rouge2_fmeasure,0.03728951183116744
|
21 |
+
gem_xsum,2,average,multiple,0.04324025638606534
|
22 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04746695901782121
|
23 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.04746695901782121
|
24 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.052665704185835084
|
25 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.052665704185835084
|
26 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05157944416358925
|
27 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.05157944416358925
|
28 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.052270761889885206
|
29 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.052270761889885206
|
30 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05295652860215851
|
31 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.05295652860215851
|
32 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.054266401974880724
|
33 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.054266401974880724
|
34 |
+
web_nlg_en,5,average,multiple,0.051867633305695
|
35 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04047796652620078
|
36 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.04047796652620078
|
37 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.0622366147541512
|
38 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.0622366147541512
|
39 |
+
wiki_lingua_en,1,average,multiple,0.051357290640175995
|
8b7178b25b/evaluation/generation/merged.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.31606716251794437, "bleu_stderr": 0.02563944351471059, "rouge1_fmeasure": 0.10246108718545202, "rouge1_fmeasure_stderr": 0.0017992092481067788, "rouge1_precision": 0.06664761750887295, "rouge1_precision_stderr": 0.0013732959148702218, "rouge1_recall": 0.2940457093239289, "rouge1_recall_stderr": 0.004555499438691762, "rouge2_fmeasure": 0.04746695901782121, "rouge2_fmeasure_stderr": 0.0011129470648980045, "rouge2_precision": 0.030425492420252335, "rouge2_precision_stderr": 0.0007598510657945697, "rouge2_recall": 0.1409722690208349, "rouge2_recall_stderr": 0.0031027610869315144, "rougeL_fmeasure": 0.09860897831373452, "rougeL_fmeasure_stderr": 0.00170202692564, "rougeL_precision": 0.06400114097684483, "rougeL_precision_stderr": 0.0012983914696295615, "rougeL_recall": 0.2849573873935468, "rougeL_recall_stderr": 0.004452046846530438, "rougeLsum_fmeasure": 0.09784615421581211, "rougeLsum_fmeasure_stderr": 0.001694890791053129, "rougeLsum_precision": 0.06366499205115346, "rougeLsum_precision_stderr": 0.0013067510671992333, "rougeLsum_recall": 0.281020892101209, "rougeLsum_recall_stderr": 0.004305691903165883}}, "1": {"PALM_prompt": {"bleu": 0.4594356029919803, "bleu_stderr": 0.025125558903473632, "rouge1_fmeasure": 0.10986827089930207, "rouge1_fmeasure_stderr": 0.0017427056384300328, "rouge1_precision": 0.07027188536501679, "rouge1_precision_stderr": 0.0012558026173472138, "rouge1_recall": 0.3450765288308058, "rouge1_recall_stderr": 0.005055990718374951, "rouge2_fmeasure": 0.052665704185835084, "rouge2_fmeasure_stderr": 0.001116219363919427, "rouge2_precision": 0.03347891417115085, "rouge2_precision_stderr": 0.0007753123397897254, "rouge2_recall": 0.1749435024767411, "rouge2_recall_stderr": 0.0036147215578007344, "rougeL_fmeasure": 0.10429693361721804, "rougeL_fmeasure_stderr": 0.0016079944655777501, "rougeL_precision": 0.06659999496172114, "rougeL_precision_stderr": 0.001145948530216064, "rougeL_recall": 0.32753689197885233, "rougeL_recall_stderr": 0.004724892115185279, "rougeLsum_fmeasure": 0.10453836123127251, "rougeLsum_fmeasure_stderr": 0.0016341468252465234, "rougeLsum_precision": 0.06687788385741289, "rougeLsum_precision_stderr": 0.0011787909083336645, "rougeLsum_recall": 0.32738351484636163, "rougeLsum_recall_stderr": 0.004663465991453483}}, "2": {"PALM_prompt": {"bleu": 0.4626917691641033, "bleu_stderr": 0.026701626014007735, "rouge1_fmeasure": 0.11037120367033523, "rouge1_fmeasure_stderr": 0.0016020141226316521, "rouge1_precision": 0.07008037252994849, "rouge1_precision_stderr": 0.0012035993637679488, "rouge1_recall": 0.36892015349821927, "rouge1_recall_stderr": 0.0052030339175267275, "rouge2_fmeasure": 0.05157944416358925, "rouge2_fmeasure_stderr": 0.0010362585811456641, "rouge2_precision": 0.032540593106790174, "rouge2_precision_stderr": 0.000731515411120624, "rouge2_recall": 0.18396134575469794, "rouge2_recall_stderr": 0.0036503252463729982, "rougeL_fmeasure": 0.10359385842660361, "rougeL_fmeasure_stderr": 0.001472076271258242, "rougeL_precision": 0.06586167470031352, "rougeL_precision_stderr": 0.0011239315244069698, "rougeL_recall": 0.3440427226978943, "rougeL_recall_stderr": 0.0046759733900977925, "rougeLsum_fmeasure": 0.1053829136834338, "rougeLsum_fmeasure_stderr": 0.0015312528661206545, "rougeLsum_precision": 0.06702892136385862, "rougeLsum_precision_stderr": 0.0011636056187413796, "rougeLsum_recall": 0.3498407885544722, "rougeLsum_recall_stderr": 0.004802606835414834}}, "3": {"PALM_prompt": {"bleu": 0.5229218783075383, "bleu_stderr": 0.03287750689960854, "rouge1_fmeasure": 0.11129898486093194, "rouge1_fmeasure_stderr": 0.001603697213150543, "rouge1_precision": 0.07078968601889724, "rouge1_precision_stderr": 0.0012530189540338403, "rouge1_recall": 0.370854174943212, "rouge1_recall_stderr": 0.005131257813808184, "rouge2_fmeasure": 0.052270761889885206, "rouge2_fmeasure_stderr": 0.00103207142359725, "rouge2_precision": 0.03298499960390668, "rouge2_precision_stderr": 0.0007381370373430346, "rouge2_recall": 0.18675978429729426, "rouge2_recall_stderr": 0.0037180617775293043, "rougeL_fmeasure": 0.10352172121642274, "rougeL_fmeasure_stderr": 0.001462770227679166, "rougeL_precision": 0.06597257056782777, "rougeL_precision_stderr": 0.0011757840654232027, "rougeL_recall": 0.3435597898095353, "rougeL_recall_stderr": 0.004593814760542546, "rougeLsum_fmeasure": 0.10603634362382153, "rougeLsum_fmeasure_stderr": 0.0015192749230585549, "rougeLsum_precision": 0.06757005923050743, "rougeLsum_precision_stderr": 0.001208590145084183, "rougeLsum_recall": 0.35225955402286013, "rougeLsum_recall_stderr": 0.004772649571059099}}, "4": {"PALM_prompt": {"bleu": 0.6153677621128861, "bleu_stderr": 0.06053705735401149, "rouge1_fmeasure": 0.11264103250558938, "rouge1_fmeasure_stderr": 0.00163118942788917, "rouge1_precision": 0.07071127507433782, "rouge1_precision_stderr": 0.0011520192275434789, "rouge1_recall": 0.38147578757395145, "rouge1_recall_stderr": 0.005244875143584364, "rouge2_fmeasure": 0.05295652860215851, "rouge2_fmeasure_stderr": 0.0010479736263979194, "rouge2_precision": 0.032992739334882344, "rouge2_precision_stderr": 0.0007156556105984286, "rouge2_recall": 0.1932194114953707, "rouge2_recall_stderr": 0.00374859289181986, "rougeL_fmeasure": 0.10299927861932978, "rougeL_fmeasure_stderr": 0.0014284397671676585, "rougeL_precision": 0.06471147959433785, "rougeL_precision_stderr": 0.0010212506145839731, "rougeL_recall": 0.3483975351615344, "rougeL_recall_stderr": 0.004606426897749366, "rougeLsum_fmeasure": 0.10684797875052807, "rougeLsum_fmeasure_stderr": 0.0015286400891968755, "rougeLsum_precision": 0.06714267179992058, "rougeLsum_precision_stderr": 0.0010872321667830566, "rougeLsum_recall": 0.3609556696570317, "rougeLsum_recall_stderr": 0.004821987801086811}}, "5": {"PALM_prompt": {"bleu": 0.6522310718826546, "bleu_stderr": 0.0342543434913143, "rouge1_fmeasure": 0.11496831895443499, "rouge1_fmeasure_stderr": 0.0016131648938140647, "rouge1_precision": 0.07224798328936255, "rouge1_precision_stderr": 0.0011956217365912398, "rouge1_recall": 0.3930127103047457, "rouge1_recall_stderr": 0.005342634817712522, "rouge2_fmeasure": 0.054266401974880724, "rouge2_fmeasure_stderr": 0.0010279738093731765, "rouge2_precision": 0.03383327897010777, "rouge2_precision_stderr": 0.0007155324088112528, "rouge2_recall": 0.20017152367590102, "rouge2_recall_stderr": 0.0038194708260410088, "rougeL_fmeasure": 0.10413378847184503, "rougeL_fmeasure_stderr": 0.001404569572763983, "rougeL_precision": 0.06558923187444043, "rougeL_precision_stderr": 0.0010757591438449354, "rougeL_recall": 0.35557332046269224, "rougeL_recall_stderr": 0.0046352766809419025, "rougeLsum_fmeasure": 0.10879985852497424, "rougeLsum_fmeasure_stderr": 0.001514236323280586, "rougeLsum_precision": 0.0684743244702778, "rougeLsum_precision_stderr": 0.0011401249172099674, "rougeLsum_recall": 0.3715362630204789, "rougeLsum_recall_stderr": 0.004921937595664833}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.8695101649478336, "bleu_stderr": 0.0774798723948649, "rouge1_fmeasure": 0.18492809283993106, "rouge1_fmeasure_stderr": 0.0019138467632652185, "rouge1_precision": 0.16049598964045364, "rouge1_precision_stderr": 0.0019462983151563367, "rouge1_recall": 0.2676713790563651, "rouge1_recall_stderr": 0.0028735302834564675, "rouge2_fmeasure": 0.04047796652620078, "rouge2_fmeasure_stderr": 0.0009161392431729087, "rouge2_precision": 0.034697662184252506, "rouge2_precision_stderr": 0.0008205861453711819, "rouge2_recall": 0.06110103740366906, "rouge2_recall_stderr": 0.0015502965185361641, "rougeL_fmeasure": 0.13965276114673036, "rougeL_fmeasure_stderr": 0.0013470236487103265, "rougeL_precision": 0.11976282577213969, "rougeL_precision_stderr": 0.0013342259804847978, "rougeL_recall": 0.20766916878715935, "rougeL_recall_stderr": 0.0023245522996809573, "rougeLsum_fmeasure": 0.17014737831445742, "rougeLsum_fmeasure_stderr": 0.0017471450796099739, "rougeLsum_precision": 0.14751551166205068, "rougeLsum_precision_stderr": 0.001780031941907223, "rougeLsum_recall": 0.24735686937216053, "rougeLsum_recall_stderr": 0.0026833728446082504}}, "1": {"tldr_en": {"bleu": 3.2332562389930515, "bleu_stderr": 0.0799951186296727, "rouge1_fmeasure": 0.23690998702342858, "rouge1_fmeasure_stderr": 0.0019890231968532827, "rouge1_precision": 0.20729964457048256, "rouge1_precision_stderr": 0.002274245844031677, "rouge1_recall": 0.33969139617897154, "rouge1_recall_stderr": 0.002833556197863864, "rouge2_fmeasure": 0.0622366147541512, "rouge2_fmeasure_stderr": 0.0011025942027406593, "rouge2_precision": 0.05460146452130771, "rouge2_precision_stderr": 0.001093799845706477, "rouge2_recall": 0.09225300993047651, "rouge2_recall_stderr": 0.0018248020314785354, "rougeL_fmeasure": 0.16727654774531026, "rougeL_fmeasure_stderr": 0.0013621791164943034, "rougeL_precision": 0.14525073667888475, "rougeL_precision_stderr": 0.0015687845277167008, "rougeL_recall": 0.24628845510672517, "rougeL_recall_stderr": 0.0023082894542053194, "rougeLsum_fmeasure": 0.2230496791129934, "rougeLsum_fmeasure_stderr": 0.0018757377324494942, "rougeLsum_precision": 0.19509659175047353, "rougeLsum_precision_stderr": 0.0021449940715420894, "rougeLsum_recall": 0.3204235648372041, "rougeLsum_recall_stderr": 0.0027128359683571725}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.03454795333510589, "bleu_stderr": 0.010722245919714395, "rouge1_fmeasure": 0.009983083620835689, "rouge1_fmeasure_stderr": 0.0007216517112036717, "rouge1_precision": 0.008330877542768076, "rouge1_precision_stderr": 0.0006789467135202171, "rouge1_recall": 0.01512213229502354, "rouge1_recall_stderr": 0.0010598056749213033, "rouge2_fmeasure": 0.0015194719382464828, "rouge2_fmeasure_stderr": 0.0002186426847142056, "rouge2_precision": 0.0012462152328688286, "rouge2_precision_stderr": 0.0001850740246851996, "rouge2_recall": 0.0021941504736507373, "rouge2_recall_stderr": 0.00031789022057788433, "rougeL_fmeasure": 0.009451610525850281, "rougeL_fmeasure_stderr": 0.0006635438560036569, "rougeL_precision": 0.0076678486076520065, "rougeL_precision_stderr": 0.0005780206165870198, "rougeL_recall": 0.014540667195886358, "rougeL_recall_stderr": 0.0010048165449253396, "rougeLsum_fmeasure": 0.008102826704141256, "rougeLsum_fmeasure_stderr": 0.0005948187054074003, "rougeLsum_precision": 0.006854342833172471, "rougeLsum_precision_stderr": 0.0005808821610184307, "rougeLsum_recall": 0.01222352423290077, "rougeLsum_recall_stderr": 0.0008679529695598302}}, "1": {"generate_text_restaurant": {"bleu": 10.110683183008721, "bleu_stderr": 0.10856580026912326, "rouge1_fmeasure": 0.4261931768500333, "rouge1_fmeasure_stderr": 0.0020338904219177937, "rouge1_precision": 0.4273885051454009, "rouge1_precision_stderr": 0.0023386626945434775, "rouge1_recall": 0.4619168895673238, "rouge1_recall_stderr": 0.002910080537073359, "rouge2_fmeasure": 0.18637115332137155, "rouge2_fmeasure_stderr": 0.0017490252095085136, "rouge2_precision": 0.18621605242973624, "rouge2_precision_stderr": 0.0018420288481743644, "rouge2_recall": 0.20425913452615088, "rouge2_recall_stderr": 0.002171191668790452, "rougeL_fmeasure": 0.30007012360581886, "rougeL_fmeasure_stderr": 0.0017073194851728616, "rougeL_precision": 0.30141579223639636, "rougeL_precision_stderr": 0.0019324093600811055, "rougeL_recall": 0.3256528804455511, "rougeL_recall_stderr": 0.002364609519861813, "rougeLsum_fmeasure": 0.35343380568090416, "rougeLsum_fmeasure_stderr": 0.0020183396806264077, "rougeLsum_precision": 0.3549239657030783, "rougeLsum_precision_stderr": 0.0022624041723470494, "rougeLsum_recall": 0.38280848019973457, "rougeLsum_recall_stderr": 0.002716917275700221}}, "2": {"generate_text_restaurant": {"bleu": 11.76941570816289, "bleu_stderr": 0.15573900245748828, "rouge1_fmeasure": 0.45040818552274897, "rouge1_fmeasure_stderr": 0.001961202454945367, "rouge1_precision": 0.44751016860851195, "rouge1_precision_stderr": 0.002306632595634685, "rouge1_recall": 0.4895112200039268, "rouge1_recall_stderr": 0.0028440252957840422, "rouge2_fmeasure": 0.21309279835498804, "rouge2_fmeasure_stderr": 0.0018084420773651576, "rouge2_precision": 0.21124466241603482, "rouge2_precision_stderr": 0.001920141813437847, "rouge2_recall": 0.23408285111523244, "rouge2_recall_stderr": 0.0022741821628275683, "rougeL_fmeasure": 0.3267435445462492, "rougeL_fmeasure_stderr": 0.0017435336297606837, "rougeL_precision": 0.3250865088115516, "rougeL_precision_stderr": 0.0020090753832284394, "rougeL_recall": 0.35555451083591993, "rougeL_recall_stderr": 0.0024076335160479975, "rougeLsum_fmeasure": 0.37491707249596695, "rougeLsum_fmeasure_stderr": 0.0020146819194002373, "rougeLsum_precision": 0.3727655581276283, "rougeLsum_precision_stderr": 0.002277906383748808, "rougeLsum_recall": 0.40742138628815283, "rougeLsum_recall_stderr": 0.0027125462981769503}}, "3": {"generate_text_restaurant": {"bleu": 12.357057866617781, "bleu_stderr": 0.11622210780282331, "rouge1_fmeasure": 0.4562426842321961, "rouge1_fmeasure_stderr": 0.0019386232886629857, "rouge1_precision": 0.45217337104133665, "rouge1_precision_stderr": 0.002294505554504784, "rouge1_recall": 0.4942792865855869, "rouge1_recall_stderr": 0.002743597811824126, "rouge2_fmeasure": 0.22032333042194527, "rouge2_fmeasure_stderr": 0.001852412446316337, "rouge2_precision": 0.2175811124968712, "rouge2_precision_stderr": 0.0019112105040248436, "rouge2_recall": 0.2409573657644072, "rouge2_recall_stderr": 0.0023044463187969245, "rougeL_fmeasure": 0.3334482257131471, "rougeL_fmeasure_stderr": 0.0017866016624809113, "rougeL_precision": 0.33057426692006453, "rougeL_precision_stderr": 0.0020126164762619228, "rougeL_recall": 0.3619557938558922, "rougeL_recall_stderr": 0.0024177454342242813, "rougeLsum_fmeasure": 0.3816770816558835, "rougeLsum_fmeasure_stderr": 0.002007876237485467, "rougeLsum_precision": 0.37808218718729336, "rougeLsum_precision_stderr": 0.0022431062970265465, "rougeLsum_recall": 0.41390580636859037, "rougeLsum_recall_stderr": 0.0026729564370056775}}, "4": {"generate_text_restaurant": {"bleu": 12.468466502215751, "bleu_stderr": 0.09990648109577639, "rouge1_fmeasure": 0.45529352918386296, "rouge1_fmeasure_stderr": 0.0019729158566691536, "rouge1_precision": 0.45220424528442504, "rouge1_precision_stderr": 0.002337808233031794, "rouge1_recall": 0.4921231264159867, "rouge1_recall_stderr": 0.0027602306430746792, "rouge2_fmeasure": 0.21993594978939215, "rouge2_fmeasure_stderr": 0.001850258705420329, "rouge2_precision": 0.21806635417357498, "rouge2_precision_stderr": 0.0019446195531272607, "rouge2_recall": 0.2401062314536212, "rouge2_recall_stderr": 0.00230192039014965, "rougeL_fmeasure": 0.3320801238250487, "rougeL_fmeasure_stderr": 0.001796774794658876, "rougeL_precision": 0.32979623685018056, "rougeL_precision_stderr": 0.002025955430723721, "rougeL_recall": 0.35968401480469203, "rougeL_recall_stderr": 0.0024104842159546787, "rougeLsum_fmeasure": 0.38073707749695784, "rougeLsum_fmeasure_stderr": 0.0020603619333531575, "rougeLsum_precision": 0.37807324998983866, "rougeLsum_precision_stderr": 0.002305718057682757, "rougeLsum_recall": 0.4118614121463698, "rougeLsum_recall_stderr": 0.0027125157827382973}}, "5": {"generate_text_restaurant": {"bleu": 12.11108163274697, "bleu_stderr": 0.1511086038189987, "rouge1_fmeasure": 0.45448318084710065, "rouge1_fmeasure_stderr": 0.0019502573764627392, "rouge1_precision": 0.4494689335142834, "rouge1_precision_stderr": 0.0023139900407069896, "rouge1_recall": 0.49189724241328564, "rouge1_recall_stderr": 0.002720672072662264, "rouge2_fmeasure": 0.21793438022656353, "rouge2_fmeasure_stderr": 0.0018382910372242867, "rouge2_precision": 0.21502011952369768, "rouge2_precision_stderr": 0.0018993108425412735, "rouge2_recall": 0.23790406650497267, "rouge2_recall_stderr": 0.002257304748049696, "rougeL_fmeasure": 0.33100721144592343, "rougeL_fmeasure_stderr": 0.0017930595424372638, "rougeL_precision": 0.3270263161597272, "rougeL_precision_stderr": 0.001992100365911566, "rougeL_recall": 0.3590215756959229, "rougeL_recall_stderr": 0.0023843408170312035, "rougeLsum_fmeasure": 0.3800458168267657, "rougeLsum_fmeasure_stderr": 0.002019377882054415, "rougeLsum_precision": 0.3757504695199971, "rougeLsum_precision_stderr": 0.00226312608847076, "rougeLsum_recall": 0.4115209951658525, "rougeLsum_recall_stderr": 0.002643403932683506}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.4185935712214466, "bleu_stderr": 0.11565432283117048, "rouge1_fmeasure": 0.22302727328338223, "rouge1_fmeasure_stderr": 0.002543649203320007, "rouge1_precision": 0.16079100538036975, "rouge1_precision_stderr": 0.0019317155011328527, "rouge1_recall": 0.38463169025745175, "rouge1_recall_stderr": 0.004526539026922928, "rouge2_fmeasure": 0.056442306230818876, "rouge2_fmeasure_stderr": 0.0016804520319396248, "rouge2_precision": 0.03999269013558242, "rouge2_precision_stderr": 0.0011971308201383902, "rouge2_recall": 0.10113339981669754, "rouge2_recall_stderr": 0.0031343748451553286, "rougeL_fmeasure": 0.16717619407627404, "rougeL_fmeasure_stderr": 0.0019481211634922503, "rougeL_precision": 0.12033234686017578, "rougeL_precision_stderr": 0.0014585203532609563, "rougeL_recall": 0.2903778922407286, "rougeL_recall_stderr": 0.003692559366146235, "rougeLsum_fmeasure": 0.17608763758164225, "rougeLsum_fmeasure_stderr": 0.002145834312930374, "rougeLsum_precision": 0.12659083153302422, "rougeLsum_precision_stderr": 0.0015911326356590116, "rougeLsum_recall": 0.30622228336959245, "rougeLsum_recall_stderr": 0.004035233836387343}}, "1": {"article_DOC_summary": {"bleu": 1.4753760157360585, "bleu_stderr": 0.053942288193643405, "rouge1_fmeasure": 0.17409533783255507, "rouge1_fmeasure_stderr": 0.002615183917058679, "rouge1_precision": 0.123975541885696, "rouge1_precision_stderr": 0.001937069913810257, "rouge1_recall": 0.3041634033606052, "rouge1_recall_stderr": 0.004462494037919621, "rouge2_fmeasure": 0.03598895109620969, "rouge2_fmeasure_stderr": 0.001496555270960858, "rouge2_precision": 0.02530985857085611, "rouge2_precision_stderr": 0.0010516965631042789, "rouge2_recall": 0.06497657707857857, "rouge2_recall_stderr": 0.0028062229327394097, "rougeL_fmeasure": 0.13719393806212205, "rougeL_fmeasure_stderr": 0.001988792672158033, "rougeL_precision": 0.09739516038141627, "rougeL_precision_stderr": 0.0014498365795769053, "rougeL_recall": 0.24191669859950216, "rougeL_recall_stderr": 0.0036028516933205405, "rougeLsum_fmeasure": 0.1371396960626393, "rougeLsum_fmeasure_stderr": 0.0021254210571197236, "rougeLsum_precision": 0.0973784808310999, "rougeLsum_precision_stderr": 0.0015541782441477088, "rougeLsum_recall": 0.24166767773481146, "rougeLsum_recall_stderr": 0.0037775246270440404}}, "2": {"article_DOC_summary": {"bleu": 1.4658077495541642, "bleu_stderr": 0.05656663077206122, "rouge1_fmeasure": 0.17850901621898715, "rouge1_fmeasure_stderr": 0.002631185065208956, "rouge1_precision": 0.12701574570405383, "rouge1_precision_stderr": 0.0019497483407488042, "rouge1_recall": 0.31209308297568034, "rouge1_recall_stderr": 0.0044823221140013695, "rouge2_fmeasure": 0.03728951183116744, "rouge2_fmeasure_stderr": 0.0014839130164513042, "rouge2_precision": 0.02628489874277305, "rouge2_precision_stderr": 0.0010496767573945315, "rouge2_recall": 0.06678550185926611, "rouge2_recall_stderr": 0.002741050501727463, "rougeL_fmeasure": 0.14040861303541324, "rougeL_fmeasure_stderr": 0.001957888420474033, "rougeL_precision": 0.09965436966997658, "rougeL_precision_stderr": 0.0014375571522234208, "rougeL_recall": 0.24737259975268608, "rougeL_recall_stderr": 0.003487423921250019, "rougeLsum_fmeasure": 0.1402274218021141, "rougeLsum_fmeasure_stderr": 0.00214102737994103, "rougeLsum_precision": 0.0995909554171278, "rougeLsum_precision_stderr": 0.001569684395558083, "rougeLsum_recall": 0.2465747406900749, "rougeLsum_recall_stderr": 0.0037567150545319778}}}}
|
8b7178b25b/evaluation/rankeval/8b7178b25b_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.334,0.01492201952373296,0
|
3 |
+
anli_r2,acc,0.341,0.014998131348402697,0
|
4 |
+
anli_r3,acc,0.3625,0.013883037874225516,0
|
5 |
+
arc_challenge,acc,0.31399317406143346,0.013562691224726291,0
|
6 |
+
arc_challenge,acc_norm,0.32849829351535836,0.013724978465537364,0
|
7 |
+
arc_easy,acc,0.6616161616161617,0.009709034670525096,0
|
8 |
+
arc_easy,acc_norm,0.5875420875420876,0.01010130544786476,0
|
9 |
+
boolq,acc,0.6333333333333333,0.008428386213506826,1
|
10 |
+
cb,acc,0.3392857142857143,0.06384226561930824,1
|
11 |
+
cb,f1,0.2059178743961352,,1
|
12 |
+
copa,acc,0.87,0.03379976689896309,0
|
13 |
+
hellaswag,acc,0.5325632344154551,0.004979188195338179,0
|
14 |
+
hellaswag,acc_norm,0.7046405098585939,0.004552718360513099,0
|
15 |
+
piqa,acc,0.7704026115342764,0.009812682950815187,0
|
16 |
+
piqa,acc_norm,0.7850924918389554,0.009583665082653316,0
|
17 |
+
rte,acc,0.48014440433212996,0.0300727231673172,0
|
18 |
+
sciq,acc,0.877,0.010391293421849877,0
|
19 |
+
sciq,acc_norm,0.795,0.01277255409611312,0
|
20 |
+
storycloze_2016,acc,0.7482629609834314,0.01003644434459808,0
|
21 |
+
winogrande,acc,0.6187845303867403,0.013650172164160305,0
|
8b7178b25b/evaluation/rankeval/8b7178b25b_0_lm-eval_global_step84877_2023-01-30-20-00-12_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.334,
|
5 |
-
"acc_stderr": 0.01492201952373296
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.341,
|
9 |
-
"acc_stderr": 0.014998131348402697
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3625,
|
13 |
-
"acc_stderr": 0.013883037874225516
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.3392857142857143,
|
17 |
-
"acc_stderr": 0.06384226561930824,
|
18 |
-
"f1": 0.2059178743961352
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.87,
|
22 |
-
"acc_stderr": 0.03379976689896309
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5325632344154551,
|
26 |
-
"acc_stderr": 0.004979188195338179,
|
27 |
-
"acc_norm": 0.7046405098585939,
|
28 |
-
"acc_norm_stderr": 0.004552718360513099
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.48014440433212996,
|
32 |
-
"acc_stderr": 0.0300727231673172
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6187845303867403,
|
36 |
-
"acc_stderr": 0.013650172164160305
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7482629609834314,
|
40 |
-
"acc_stderr": 0.01003644434459808
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6333333333333333,
|
44 |
-
"acc_stderr": 0.008428386213506826
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6616161616161617,
|
48 |
-
"acc_stderr": 0.009709034670525096,
|
49 |
-
"acc_norm": 0.5875420875420876,
|
50 |
-
"acc_norm_stderr": 0.01010130544786476
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.31399317406143346,
|
54 |
-
"acc_stderr": 0.013562691224726291,
|
55 |
-
"acc_norm": 0.32849829351535836,
|
56 |
-
"acc_norm_stderr": 0.013724978465537364
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.877,
|
60 |
-
"acc_stderr": 0.010391293421849877,
|
61 |
-
"acc_norm": 0.795,
|
62 |
-
"acc_norm_stderr": 0.01277255409611312
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7704026115342764,
|
66 |
-
"acc_stderr": 0.009812682950815187,
|
67 |
-
"acc_norm": 0.7850924918389554,
|
68 |
-
"acc_norm_stderr": 0.009583665082653316
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b25b/evaluation/rankeval/8b7178b25b_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.334,0.01492201952373297,0
|
3 |
+
anli_r2,acc,0.336,0.01494414023379502,0
|
4 |
+
anli_r3,acc,0.33916666666666667,0.013672343491681819,0
|
5 |
+
arc_challenge,acc,0.33361774744027306,0.01377868705417654,0
|
6 |
+
arc_challenge,acc_norm,0.34897610921501704,0.013928933461382496,0
|
7 |
+
arc_easy,acc,0.6856060606060606,0.009526702423162905,0
|
8 |
+
arc_easy,acc_norm,0.6426767676767676,0.009833205612463107,0
|
9 |
+
boolq,acc,0.6351681957186545,0.00841944098496365,1
|
10 |
+
cb,acc,0.19642857142857142,0.05357142857142859,1
|
11 |
+
cb,f1,0.18920723969812325,,1
|
12 |
+
copa,acc,0.85,0.03588702812826373,0
|
13 |
+
hellaswag,acc,0.5288787094204341,0.004981451704451047,0
|
14 |
+
hellaswag,acc_norm,0.7052380003983271,0.0045500389685506236,0
|
15 |
+
piqa,acc,0.7627856365614799,0.009924694933586364,0
|
16 |
+
piqa,acc_norm,0.7742110990206746,0.009754980670917315,0
|
17 |
+
rte,acc,0.4729241877256318,0.030052303463143706,0
|
18 |
+
sciq,acc,0.922,0.008484573530118581,0
|
19 |
+
sciq,acc_norm,0.897,0.00961683333969579,0
|
20 |
+
storycloze_2016,acc,0.7402458578300374,0.010140244588689848,0
|
21 |
+
winogrande,acc,0.6172059984214681,0.013660946109442013,0
|
8b7178b25b/evaluation/rankeval/8b7178b25b_1_lm-eval_global_step84877_2023-01-30-20-00-12_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.334,
|
5 |
-
"acc_stderr": 0.01492201952373297
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.336,
|
9 |
-
"acc_stderr": 0.01494414023379502
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.33916666666666667,
|
13 |
-
"acc_stderr": 0.013672343491681819
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.19642857142857142,
|
17 |
-
"acc_stderr": 0.05357142857142859,
|
18 |
-
"f1": 0.18920723969812325
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.85,
|
22 |
-
"acc_stderr": 0.03588702812826373
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5288787094204341,
|
26 |
-
"acc_stderr": 0.004981451704451047,
|
27 |
-
"acc_norm": 0.7052380003983271,
|
28 |
-
"acc_norm_stderr": 0.0045500389685506236
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.4729241877256318,
|
32 |
-
"acc_stderr": 0.030052303463143706
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6172059984214681,
|
36 |
-
"acc_stderr": 0.013660946109442013
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7402458578300374,
|
40 |
-
"acc_stderr": 0.010140244588689848
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6351681957186545,
|
44 |
-
"acc_stderr": 0.00841944098496365
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6856060606060606,
|
48 |
-
"acc_stderr": 0.009526702423162905,
|
49 |
-
"acc_norm": 0.6426767676767676,
|
50 |
-
"acc_norm_stderr": 0.009833205612463107
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.33361774744027306,
|
54 |
-
"acc_stderr": 0.01377868705417654,
|
55 |
-
"acc_norm": 0.34897610921501704,
|
56 |
-
"acc_norm_stderr": 0.013928933461382496
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.922,
|
60 |
-
"acc_stderr": 0.008484573530118581,
|
61 |
-
"acc_norm": 0.897,
|
62 |
-
"acc_norm_stderr": 0.00961683333969579
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7627856365614799,
|
66 |
-
"acc_stderr": 0.009924694933586364,
|
67 |
-
"acc_norm": 0.7742110990206746,
|
68 |
-
"acc_norm_stderr": 0.009754980670917315
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b25b/evaluation/rankeval/8b7178b25b_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.331,0.014888272588203938,0
|
3 |
+
anli_r2,acc,0.324,0.014806864733738863,0
|
4 |
+
anli_r3,acc,0.3416666666666667,0.013696658778002519,0
|
5 |
+
arc_challenge,acc,0.3370307167235495,0.013813476652902265,0
|
6 |
+
arc_challenge,acc_norm,0.35665529010238906,0.013998056902620203,0
|
7 |
+
arc_easy,acc,0.686026936026936,0.00952324533521551,0
|
8 |
+
arc_easy,acc_norm,0.6628787878787878,0.009700146509130068,0
|
9 |
+
boolq,acc,0.6467889908256881,0.008359705247064296,1
|
10 |
+
cb,acc,0.14285714285714285,0.047184161362558305,1
|
11 |
+
cb,f1,0.1381769825918762,,1
|
12 |
+
copa,acc,0.83,0.03775251680686371,0
|
13 |
+
hellaswag,acc,0.5295757817167894,0.004981044370530809,0
|
14 |
+
hellaswag,acc_norm,0.7048396733718383,0.0045518262729780596,0
|
15 |
+
piqa,acc,0.7742110990206746,0.009754980670917315,0
|
16 |
+
piqa,acc_norm,0.7867247007616975,0.00955712122586134,0
|
17 |
+
rte,acc,0.49458483754512633,0.030094698123239966,0
|
18 |
+
sciq,acc,0.938,0.0076298239962803065,0
|
19 |
+
sciq,acc_norm,0.918,0.00868051561552373,0
|
20 |
+
storycloze_2016,acc,0.7514698022447889,0.009993659448666372,0
|
21 |
+
winogrande,acc,0.611681136543015,0.013697456658457232,0
|
8b7178b25b/evaluation/rankeval/8b7178b25b_2_lm-eval_global_step84877_2023-01-30-20-00-12_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.331,
|
5 |
-
"acc_stderr": 0.014888272588203938
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.324,
|
9 |
-
"acc_stderr": 0.014806864733738863
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3416666666666667,
|
13 |
-
"acc_stderr": 0.013696658778002519
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.14285714285714285,
|
17 |
-
"acc_stderr": 0.047184161362558305,
|
18 |
-
"f1": 0.1381769825918762
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.83,
|
22 |
-
"acc_stderr": 0.03775251680686371
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5295757817167894,
|
26 |
-
"acc_stderr": 0.004981044370530809,
|
27 |
-
"acc_norm": 0.7048396733718383,
|
28 |
-
"acc_norm_stderr": 0.0045518262729780596
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.49458483754512633,
|
32 |
-
"acc_stderr": 0.030094698123239966
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.611681136543015,
|
36 |
-
"acc_stderr": 0.013697456658457232
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7514698022447889,
|
40 |
-
"acc_stderr": 0.009993659448666372
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6467889908256881,
|
44 |
-
"acc_stderr": 0.008359705247064296
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.686026936026936,
|
48 |
-
"acc_stderr": 0.00952324533521551,
|
49 |
-
"acc_norm": 0.6628787878787878,
|
50 |
-
"acc_norm_stderr": 0.009700146509130068
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3370307167235495,
|
54 |
-
"acc_stderr": 0.013813476652902265,
|
55 |
-
"acc_norm": 0.35665529010238906,
|
56 |
-
"acc_norm_stderr": 0.013998056902620203
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.938,
|
60 |
-
"acc_stderr": 0.0076298239962803065,
|
61 |
-
"acc_norm": 0.918,
|
62 |
-
"acc_norm_stderr": 0.00868051561552373
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7742110990206746,
|
66 |
-
"acc_stderr": 0.009754980670917315,
|
67 |
-
"acc_norm": 0.7867247007616975,
|
68 |
-
"acc_norm_stderr": 0.00955712122586134
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b25b/evaluation/rankeval/8b7178b25b_3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.335,0.014933117490932575,0
|
3 |
+
anli_r2,acc,0.328,0.014853842487270334,0
|
4 |
+
anli_r3,acc,0.3433333333333333,0.01371263383046586,0
|
5 |
+
arc_challenge,acc,0.3378839590443686,0.013822047922283516,0
|
6 |
+
arc_challenge,acc_norm,0.3643344709897611,0.014063260279882412,0
|
7 |
+
arc_easy,acc,0.6957070707070707,0.009441202922359183,0
|
8 |
+
arc_easy,acc_norm,0.6717171717171717,0.00963574950926216,0
|
9 |
+
boolq,acc,0.6440366972477064,0.008374337517726581,1
|
10 |
+
cb,acc,0.14285714285714285,0.047184161362558305,1
|
11 |
+
cb,f1,0.13156966490299823,,1
|
12 |
+
copa,acc,0.84,0.03684529491774709,0
|
13 |
+
hellaswag,acc,0.5320653256323441,0.00497951000177662,0
|
14 |
+
hellaswag,acc_norm,0.7050388368850826,0.004550933142528758,0
|
15 |
+
piqa,acc,0.7736670293797606,0.009763294246879427,0
|
16 |
+
piqa,acc_norm,0.7845484221980413,0.009592463115658107,0
|
17 |
+
rte,acc,0.49097472924187724,0.030091559826331334,0
|
18 |
+
sciq,acc,0.931,0.008018934050315155,0
|
19 |
+
sciq,acc_norm,0.922,0.008484573530118587,0
|
20 |
+
storycloze_2016,acc,0.7536076964190273,0.009964727533753546,0
|
21 |
+
winogrande,acc,0.6148382004735596,0.013676821287521413,0
|
8b7178b25b/evaluation/rankeval/8b7178b25b_3_lm-eval_global_step84877_2023-01-30-20-00-12_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.335,
|
5 |
-
"acc_stderr": 0.014933117490932575
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.328,
|
9 |
-
"acc_stderr": 0.014853842487270334
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3433333333333333,
|
13 |
-
"acc_stderr": 0.01371263383046586
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.14285714285714285,
|
17 |
-
"acc_stderr": 0.047184161362558305,
|
18 |
-
"f1": 0.13156966490299823
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.84,
|
22 |
-
"acc_stderr": 0.03684529491774709
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5320653256323441,
|
26 |
-
"acc_stderr": 0.00497951000177662,
|
27 |
-
"acc_norm": 0.7050388368850826,
|
28 |
-
"acc_norm_stderr": 0.004550933142528758
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.49097472924187724,
|
32 |
-
"acc_stderr": 0.030091559826331334
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6148382004735596,
|
36 |
-
"acc_stderr": 0.013676821287521413
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7536076964190273,
|
40 |
-
"acc_stderr": 0.009964727533753546
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6440366972477064,
|
44 |
-
"acc_stderr": 0.008374337517726581
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6957070707070707,
|
48 |
-
"acc_stderr": 0.009441202922359183,
|
49 |
-
"acc_norm": 0.6717171717171717,
|
50 |
-
"acc_norm_stderr": 0.00963574950926216
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3378839590443686,
|
54 |
-
"acc_stderr": 0.013822047922283516,
|
55 |
-
"acc_norm": 0.3643344709897611,
|
56 |
-
"acc_norm_stderr": 0.014063260279882412
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.931,
|
60 |
-
"acc_stderr": 0.008018934050315155,
|
61 |
-
"acc_norm": 0.922,
|
62 |
-
"acc_norm_stderr": 0.008484573530118587
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7736670293797606,
|
66 |
-
"acc_stderr": 0.009763294246879427,
|
67 |
-
"acc_norm": 0.7845484221980413,
|
68 |
-
"acc_norm_stderr": 0.009592463115658107
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b25b/evaluation/rankeval/8b7178b25b_4.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.337,0.014955087918653605,0
|
3 |
+
anli_r2,acc,0.336,0.014944140233795018,0
|
4 |
+
anli_r3,acc,0.3375,0.013655897185463658,0
|
5 |
+
arc_challenge,acc,0.3515358361774744,0.01395241369960094,0
|
6 |
+
arc_challenge,acc_norm,0.3660409556313993,0.014077223108470139,0
|
7 |
+
arc_easy,acc,0.6999158249158249,0.00940400055851335,0
|
8 |
+
arc_easy,acc_norm,0.6746632996632996,0.009613427708996187,0
|
9 |
+
boolq,acc,0.6474006116207951,0.00835641249356212,1
|
10 |
+
cb,acc,0.125,0.04459412925079224,1
|
11 |
+
cb,f1,0.10899594232927566,,1
|
12 |
+
copa,acc,0.85,0.03588702812826373,0
|
13 |
+
hellaswag,acc,0.5310695080661223,0.004980138679161042,0
|
14 |
+
hellaswag,acc_norm,0.7102170882294364,0.004527343651130806,0
|
15 |
+
piqa,acc,0.7698585418933623,0.009820832826839815,0
|
16 |
+
piqa,acc_norm,0.7780195865070729,0.009696120744662022,0
|
17 |
+
rte,acc,0.48375451263537905,0.030080573208738064,0
|
18 |
+
sciq,acc,0.934,0.00785529793869759,0
|
19 |
+
sciq,acc_norm,0.93,0.008072494358323508,0
|
20 |
+
storycloze_2016,acc,0.7589524318546232,0.00989094649057693,0
|
21 |
+
winogrande,acc,0.6227308602999211,0.013622567928799501,0
|
8b7178b25b/evaluation/rankeval/8b7178b25b_4_lm-eval_global_step84877_2023-01-30-20-00-12_4shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.337,
|
5 |
-
"acc_stderr": 0.014955087918653605
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.336,
|
9 |
-
"acc_stderr": 0.014944140233795018
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3375,
|
13 |
-
"acc_stderr": 0.013655897185463658
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.125,
|
17 |
-
"acc_stderr": 0.04459412925079224,
|
18 |
-
"f1": 0.10899594232927566
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.85,
|
22 |
-
"acc_stderr": 0.03588702812826373
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5310695080661223,
|
26 |
-
"acc_stderr": 0.004980138679161042,
|
27 |
-
"acc_norm": 0.7102170882294364,
|
28 |
-
"acc_norm_stderr": 0.004527343651130806
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.48375451263537905,
|
32 |
-
"acc_stderr": 0.030080573208738064
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6227308602999211,
|
36 |
-
"acc_stderr": 0.013622567928799501
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7589524318546232,
|
40 |
-
"acc_stderr": 0.00989094649057693
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6474006116207951,
|
44 |
-
"acc_stderr": 0.00835641249356212
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6999158249158249,
|
48 |
-
"acc_stderr": 0.00940400055851335,
|
49 |
-
"acc_norm": 0.6746632996632996,
|
50 |
-
"acc_norm_stderr": 0.009613427708996187
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3515358361774744,
|
54 |
-
"acc_stderr": 0.01395241369960094,
|
55 |
-
"acc_norm": 0.3660409556313993,
|
56 |
-
"acc_norm_stderr": 0.014077223108470139
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.934,
|
60 |
-
"acc_stderr": 0.00785529793869759,
|
61 |
-
"acc_norm": 0.93,
|
62 |
-
"acc_norm_stderr": 0.008072494358323508
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7698585418933623,
|
66 |
-
"acc_stderr": 0.009820832826839815,
|
67 |
-
"acc_norm": 0.7780195865070729,
|
68 |
-
"acc_norm_stderr": 0.009696120744662022
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b25b/evaluation/rankeval/8b7178b25b_5.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.333,0.01491084616422986,0
|
3 |
+
anli_r2,acc,0.332,0.014899597242811478,0
|
4 |
+
anli_r3,acc,0.33916666666666667,0.013672343491681822,0
|
5 |
+
arc_challenge,acc,0.35665529010238906,0.013998056902620203,0
|
6 |
+
arc_challenge,acc_norm,0.37627986348122866,0.014157022555407173,0
|
7 |
+
arc_easy,acc,0.7028619528619529,0.009377397867796849,0
|
8 |
+
arc_easy,acc_norm,0.6771885521885522,0.009593950220366737,0
|
9 |
+
boolq,acc,0.6486238532110091,0.00834978197660316,1
|
10 |
+
cb,acc,0.14285714285714285,0.04718416136255829,1
|
11 |
+
cb,f1,0.14017094017094014,,1
|
12 |
+
copa,acc,0.86,0.034873508801977725,0
|
13 |
+
hellaswag,acc,0.5324636526588329,0.004979252954977319,0
|
14 |
+
hellaswag,acc_norm,0.7127066321449911,0.004515748192605716,0
|
15 |
+
piqa,acc,0.764961915125136,0.00989314668880531,0
|
16 |
+
piqa,acc_norm,0.7840043525571273,0.009601236303553544,0
|
17 |
+
rte,acc,0.4981949458483754,0.030096267148976626,0
|
18 |
+
sciq,acc,0.938,0.007629823996280306,0
|
19 |
+
sciq,acc_norm,0.93,0.008072494358323508,0
|
20 |
+
storycloze_2016,acc,0.7546766435061465,0.009950137914623096,0
|
21 |
+
winogrande,acc,0.6195737963693765,0.013644727908656833,0
|
8b7178b25b/evaluation/rankeval/8b7178b25b_5_lm-eval_global_step84877_2023-01-30-20-00-12_5shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.333,
|
5 |
-
"acc_stderr": 0.01491084616422986
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.332,
|
9 |
-
"acc_stderr": 0.014899597242811478
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.33916666666666667,
|
13 |
-
"acc_stderr": 0.013672343491681822
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.14285714285714285,
|
17 |
-
"acc_stderr": 0.04718416136255829,
|
18 |
-
"f1": 0.14017094017094014
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.86,
|
22 |
-
"acc_stderr": 0.034873508801977725
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5324636526588329,
|
26 |
-
"acc_stderr": 0.004979252954977319,
|
27 |
-
"acc_norm": 0.7127066321449911,
|
28 |
-
"acc_norm_stderr": 0.004515748192605716
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.4981949458483754,
|
32 |
-
"acc_stderr": 0.030096267148976626
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6195737963693765,
|
36 |
-
"acc_stderr": 0.013644727908656833
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7546766435061465,
|
40 |
-
"acc_stderr": 0.009950137914623096
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6486238532110091,
|
44 |
-
"acc_stderr": 0.00834978197660316
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.7028619528619529,
|
48 |
-
"acc_stderr": 0.009377397867796849,
|
49 |
-
"acc_norm": 0.6771885521885522,
|
50 |
-
"acc_norm_stderr": 0.009593950220366737
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.35665529010238906,
|
54 |
-
"acc_stderr": 0.013998056902620203,
|
55 |
-
"acc_norm": 0.37627986348122866,
|
56 |
-
"acc_norm_stderr": 0.014157022555407173
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.938,
|
60 |
-
"acc_stderr": 0.007629823996280306,
|
61 |
-
"acc_norm": 0.93,
|
62 |
-
"acc_norm_stderr": 0.008072494358323508
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.764961915125136,
|
66 |
-
"acc_stderr": 0.00989314668880531,
|
67 |
-
"acc_norm": 0.7840043525571273,
|
68 |
-
"acc_norm_stderr": 0.009601236303553544
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b35b/evaluation/generation/merged.csv
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.010042498177274786
|
3 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.010042498177274786
|
4 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.17310173953900088
|
5 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.17310173953900088
|
6 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.20571413885055867
|
7 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.20571413885055867
|
8 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.21198726153120898
|
9 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.21198726153120898
|
10 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2155975402534293
|
11 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2155975402534293
|
12 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.21627498227149344
|
13 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.21627498227149344
|
14 |
+
e2e_nlg_cleaned,5,average,multiple,0.17211969343716102
|
15 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.05332080380122619
|
16 |
+
gem_xsum,0,median,rouge2_fmeasure,0.05332080380122619
|
17 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04451173090845151
|
18 |
+
gem_xsum,1,median,rouge2_fmeasure,0.04451173090845151
|
19 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05083267238349886
|
20 |
+
gem_xsum,2,median,rouge2_fmeasure,0.05083267238349886
|
21 |
+
gem_xsum,2,average,multiple,0.04955506903105885
|
22 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05212321093989505
|
23 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.05212321093989505
|
24 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.058822699744876736
|
25 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.058822699744876736
|
26 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.059482069465634994
|
27 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.059482069465634994
|
28 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05931767519712951
|
29 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.05931767519712951
|
30 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05886517252410502
|
31 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.05886517252410502
|
32 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.059919950381812144
|
33 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.059919950381812144
|
34 |
+
web_nlg_en,5,average,multiple,0.058088463042242244
|
35 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03900064299554168
|
36 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.03900064299554168
|
37 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.06561109570973458
|
38 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.06561109570973458
|
39 |
+
wiki_lingua_en,1,average,multiple,0.05230586935263813
|
8b7178b35b/evaluation/generation/merged.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.32917980943922837, "bleu_stderr": 0.028436979522399264, "rouge1_fmeasure": 0.11092359878172349, "rouge1_fmeasure_stderr": 0.0021083743244835647, "rouge1_precision": 0.07454038083418474, "rouge1_precision_stderr": 0.001766454284632946, "rouge1_recall": 0.30290353802392916, "rouge1_recall_stderr": 0.004835002819067963, "rouge2_fmeasure": 0.05212321093989505, "rouge2_fmeasure_stderr": 0.0013412640798549406, "rouge2_precision": 0.03405534335780326, "rouge2_precision_stderr": 0.0009795255375204467, "rouge2_recall": 0.14725829338052365, "rouge2_recall_stderr": 0.0033496237842817397, "rougeL_fmeasure": 0.1059469350929842, "rougeL_fmeasure_stderr": 0.0019397934888169463, "rougeL_precision": 0.07086594439969038, "rougeL_precision_stderr": 0.0016082892660627373, "rougeL_recall": 0.29226896530687846, "rougeL_recall_stderr": 0.00466575095416219, "rougeLsum_fmeasure": 0.1060475373688485, "rougeLsum_fmeasure_stderr": 0.0019809059744097615, "rougeLsum_precision": 0.0711343154862909, "rougeLsum_precision_stderr": 0.001649638611842115, "rougeLsum_recall": 0.2903231316909427, "rougeLsum_recall_stderr": 0.004582555178091478}}, "1": {"PALM_prompt": {"bleu": 0.5285748263104777, "bleu_stderr": 0.05154539253732287, "rouge1_fmeasure": 0.1218665522358165, "rouge1_fmeasure_stderr": 0.0019158434392141466, "rouge1_precision": 0.07826126347996014, "rouge1_precision_stderr": 0.001412323883175065, "rouge1_recall": 0.38231102690350377, "rouge1_recall_stderr": 0.0053881864820272286, "rouge2_fmeasure": 0.058822699744876736, "rouge2_fmeasure_stderr": 0.0012405607498159584, "rouge2_precision": 0.037391127717872924, "rouge2_precision_stderr": 0.0008664479676328054, "rouge2_recall": 0.19839458948033734, "rouge2_recall_stderr": 0.004010973785657583, "rougeL_fmeasure": 0.11559328292690177, "rougeL_fmeasure_stderr": 0.001764131335387481, "rougeL_precision": 0.07407975935897594, "rougeL_precision_stderr": 0.0012854765812861943, "rougeL_recall": 0.3629315958992141, "rougeL_recall_stderr": 0.005069344323262684, "rougeLsum_fmeasure": 0.11544948693652962, "rougeLsum_fmeasure_stderr": 0.0017842924340211324, "rougeLsum_precision": 0.07415121638849131, "rougeLsum_precision_stderr": 0.001318142684082723, "rougeLsum_recall": 0.3614370946625198, "rougeLsum_recall_stderr": 0.004964006794864235}}, "2": {"PALM_prompt": {"bleu": 0.6066453441760302, "bleu_stderr": 0.02113523644793951, "rouge1_fmeasure": 0.12529215862638043, "rouge1_fmeasure_stderr": 0.0018259072915703836, "rouge1_precision": 0.08017864305448325, "rouge1_precision_stderr": 0.0014244751923494794, "rouge1_recall": 0.40480204267227765, "rouge1_recall_stderr": 0.005138963278066339, "rouge2_fmeasure": 0.059482069465634994, "rouge2_fmeasure_stderr": 0.0011595782202414246, "rouge2_precision": 0.03746116833011539, "rouge2_precision_stderr": 0.0008054382500683709, "rouge2_recall": 0.2078327469176558, "rouge2_recall_stderr": 0.003905498610846629, "rougeL_fmeasure": 0.11765794068577944, "rougeL_fmeasure_stderr": 0.0016600536639441099, "rougeL_precision": 0.0752258255540516, "rougeL_precision_stderr": 0.0013017426600869019, "rougeL_recall": 0.37921672738638584, "rougeL_recall_stderr": 0.004724125031038082, "rougeLsum_fmeasure": 0.11886578443137756, "rougeLsum_fmeasure_stderr": 0.001710265056607144, "rougeLsum_precision": 0.07612402487838264, "rougeLsum_precision_stderr": 0.0013499817758363393, "rougeLsum_recall": 0.3833520840989382, "rougeLsum_recall_stderr": 0.004787520387833171}}, "3": {"PALM_prompt": {"bleu": 0.6647933502215259, "bleu_stderr": 0.04923433986433192, "rouge1_fmeasure": 0.12320395767637426, "rouge1_fmeasure_stderr": 0.0017848979409527468, "rouge1_precision": 0.0781723421483203, "rouge1_precision_stderr": 0.0013023299811420648, "rouge1_recall": 0.4046567231900049, "rouge1_recall_stderr": 0.005137890673994109, "rouge2_fmeasure": 0.05931767519712951, "rouge2_fmeasure_stderr": 0.0011649351056668448, "rouge2_precision": 0.037339890555931926, "rouge2_precision_stderr": 0.0008144074272058679, "rouge2_recall": 0.2106193724526676, "rouge2_recall_stderr": 0.003927640735394401, "rougeL_fmeasure": 0.1150473458151772, "rougeL_fmeasure_stderr": 0.001619592187307252, "rougeL_precision": 0.07298298160971306, "rougeL_precision_stderr": 0.0011729650830812643, "rougeL_recall": 0.37541756571956564, "rougeL_recall_stderr": 0.004598099062061018, "rougeLsum_fmeasure": 0.1167920444605632, "rougeLsum_fmeasure_stderr": 0.0016685972133379994, "rougeLsum_precision": 0.0741403065968848, "rougeLsum_precision_stderr": 0.0012209548996141538, "rougeLsum_recall": 0.38296199684709586, "rougeLsum_recall_stderr": 0.004753402389332228}}, "4": {"PALM_prompt": {"bleu": 0.6566656848041873, "bleu_stderr": 0.03433459283803238, "rouge1_fmeasure": 0.12360753221007545, "rouge1_fmeasure_stderr": 0.001737583890202225, "rouge1_precision": 0.07771155841369212, "rouge1_precision_stderr": 0.0012374008770267796, "rouge1_recall": 0.4160226119034251, "rouge1_recall_stderr": 0.005178203274221526, "rouge2_fmeasure": 0.05886517252410502, "rouge2_fmeasure_stderr": 0.0011242425541824303, "rouge2_precision": 0.036708952345678336, "rouge2_precision_stderr": 0.0007671406759682118, "rouge2_recall": 0.2142530872032143, "rouge2_recall_stderr": 0.003938344188890309, "rougeL_fmeasure": 0.11421873690833906, "rougeL_fmeasure_stderr": 0.0015701261033135242, "rougeL_precision": 0.07189004426671647, "rougeL_precision_stderr": 0.001119612418604389, "rougeL_recall": 0.381893221081078, "rougeL_recall_stderr": 0.004577229345577291, "rougeLsum_fmeasure": 0.11777877062717519, "rougeLsum_fmeasure_stderr": 0.0016474868402205588, "rougeLsum_precision": 0.0741079421040528, "rougeLsum_precision_stderr": 0.0011766632070640108, "rougeLsum_recall": 0.39536320623753024, "rougeLsum_recall_stderr": 0.004834163190323313}}, "5": {"PALM_prompt": {"bleu": 0.7608515369903156, "bleu_stderr": 0.05544338902785279, "rouge1_fmeasure": 0.12542945154484628, "rouge1_fmeasure_stderr": 0.0017246642532628591, "rouge1_precision": 0.07874251282939974, "rouge1_precision_stderr": 0.0012390781355337446, "rouge1_recall": 0.4274915903037139, "rouge1_recall_stderr": 0.0052178214889688, "rouge2_fmeasure": 0.059919950381812144, "rouge2_fmeasure_stderr": 0.0011224239943820847, "rouge2_precision": 0.03725892588478996, "rouge2_precision_stderr": 0.0007691554935322187, "rouge2_recall": 0.22193078536394883, "rouge2_recall_stderr": 0.003999018222823969, "rougeL_fmeasure": 0.11503193912763204, "rougeL_fmeasure_stderr": 0.0015332329154476773, "rougeL_precision": 0.07225670171843512, "rougeL_precision_stderr": 0.001101933354024317, "rougeL_recall": 0.390252551254915, "rougeL_recall_stderr": 0.004591599013709285, "rougeLsum_fmeasure": 0.11871364855558929, "rougeLsum_fmeasure_stderr": 0.0016202511632686142, "rougeLsum_precision": 0.07457349456330893, "rougeLsum_precision_stderr": 0.0011662890645907052, "rougeLsum_recall": 0.40390693081462914, "rougeLsum_recall_stderr": 0.004850186326089204}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.7311283501769414, "bleu_stderr": 0.07292848467922458, "rouge1_fmeasure": 0.18112864366064585, "rouge1_fmeasure_stderr": 0.0019261005266957636, "rouge1_precision": 0.15412231045043578, "rouge1_precision_stderr": 0.0019133423603737852, "rouge1_recall": 0.26596610436897833, "rouge1_recall_stderr": 0.0029078617781084343, "rouge2_fmeasure": 0.03900064299554168, "rouge2_fmeasure_stderr": 0.0009076451372655918, "rouge2_precision": 0.032745788295643126, "rouge2_precision_stderr": 0.000790295587407252, "rouge2_recall": 0.06015567293207339, "rouge2_recall_stderr": 0.0015850260882465787, "rougeL_fmeasure": 0.13940156755327857, "rougeL_fmeasure_stderr": 0.0013668263941440067, "rougeL_precision": 0.1171936085278983, "rougeL_precision_stderr": 0.00132039683220814, "rougeL_recall": 0.21005905052835785, "rougeL_recall_stderr": 0.002357155817310683, "rougeLsum_fmeasure": 0.16698321861271978, "rougeLsum_fmeasure_stderr": 0.0017578260329471065, "rougeLsum_precision": 0.14187748986714835, "rougeLsum_precision_stderr": 0.0017420221740131032, "rougeLsum_recall": 0.2462694168340833, "rougeLsum_recall_stderr": 0.002709379098155295}}, "1": {"tldr_en": {"bleu": 3.5922679202027847, "bleu_stderr": 0.05984463327930867, "rouge1_fmeasure": 0.24327622952771533, "rouge1_fmeasure_stderr": 0.002037288100861591, "rouge1_precision": 0.2123430992404537, "rouge1_precision_stderr": 0.002316190617323844, "rouge1_recall": 0.349869999562048, "rouge1_recall_stderr": 0.0028776097446615145, "rouge2_fmeasure": 0.06561109570973458, "rouge2_fmeasure_stderr": 0.0011754883309790891, "rouge2_precision": 0.05726169522995292, "rouge2_precision_stderr": 0.0011305457211571272, "rouge2_recall": 0.09761319314398223, "rouge2_recall_stderr": 0.001931346687472591, "rougeL_fmeasure": 0.17044218669835878, "rougeL_fmeasure_stderr": 0.001393233918351284, "rougeL_precision": 0.147593790999516, "rougeL_precision_stderr": 0.0015843013179494662, "rougeL_recall": 0.25163325238017475, "rougeL_recall_stderr": 0.0023185700212327545, "rougeLsum_fmeasure": 0.22970291581408894, "rougeLsum_fmeasure_stderr": 0.0019241182984497916, "rougeLsum_precision": 0.20040929059968782, "rougeLsum_precision_stderr": 0.0021906873980096493, "rougeLsum_recall": 0.33105684407128727, "rougeLsum_recall_stderr": 0.0027539492668926546}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.17854969577688523, "bleu_stderr": 0.02350515901837014, "rouge1_fmeasure": 0.14982661873128114, "rouge1_fmeasure_stderr": 0.0010486948402501617, "rouge1_precision": 0.34481003507654684, "rouge1_precision_stderr": 0.0024960642876163215, "rouge1_recall": 0.11140479511426793, "rouge1_recall_stderr": 0.0012326825192403048, "rouge2_fmeasure": 0.010042498177274786, "rouge2_fmeasure_stderr": 0.0004526496809829142, "rouge2_precision": 0.02652033223020393, "rouge2_precision_stderr": 0.0012104924394046484, "rouge2_recall": 0.007738642998342415, "rouge2_recall_stderr": 0.00040331722748865453, "rougeL_fmeasure": 0.13556698518148783, "rougeL_fmeasure_stderr": 0.0010029610186637107, "rougeL_precision": 0.31148977629184244, "rougeL_precision_stderr": 0.0024243327495898086, "rougeL_recall": 0.10191142585459571, "rougeL_recall_stderr": 0.0012109507603402815, "rougeLsum_fmeasure": 0.1364054799839495, "rougeLsum_fmeasure_stderr": 0.0010086900095262318, "rougeLsum_precision": 0.3173006007013496, "rougeLsum_precision_stderr": 0.002513826319580817, "rougeLsum_recall": 0.10030998557950792, "rougeLsum_recall_stderr": 0.0010758913304541808}}, "1": {"generate_text_restaurant": {"bleu": 8.550714951198435, "bleu_stderr": 0.06794955954281734, "rouge1_fmeasure": 0.3940891272584714, "rouge1_fmeasure_stderr": 0.00233253448293986, "rouge1_precision": 0.37441894592411723, "rouge1_precision_stderr": 0.00285043399066679, "rouge1_recall": 0.4681809355112701, "rouge1_recall_stderr": 0.0029387631347472464, "rouge2_fmeasure": 0.17310173953900088, "rouge2_fmeasure_stderr": 0.0017673218319637287, "rouge2_precision": 0.16514863563264978, "rouge2_precision_stderr": 0.0019360690941877992, "rouge2_recall": 0.20649495583013436, "rouge2_recall_stderr": 0.0021624061718912763, "rougeL_fmeasure": 0.2901588169021053, "rougeL_fmeasure_stderr": 0.0017017464729554115, "rougeL_precision": 0.2743117208263101, "rougeL_precision_stderr": 0.0021002662608094564, "rougeL_recall": 0.34924701312209255, "rougeL_recall_stderr": 0.0023705538319379695, "rougeLsum_fmeasure": 0.327671306182913, "rougeLsum_fmeasure_stderr": 0.0022171207606254796, "rougeLsum_precision": 0.31201556645452433, "rougeLsum_precision_stderr": 0.002622342685847327, "rougeLsum_recall": 0.3885350242195471, "rougeLsum_recall_stderr": 0.0027432106106179788}}, "2": {"generate_text_restaurant": {"bleu": 11.867888027230435, "bleu_stderr": 0.15989930331829488, "rouge1_fmeasure": 0.4442817836593718, "rouge1_fmeasure_stderr": 0.0019988950330102873, "rouge1_precision": 0.44246406218448636, "rouge1_precision_stderr": 0.002320846131050864, "rouge1_recall": 0.4822237492014395, "rouge1_recall_stderr": 0.0028480697292934715, "rouge2_fmeasure": 0.20571413885055867, "rouge2_fmeasure_stderr": 0.001815833869403785, "rouge2_precision": 0.20447987966953268, "rouge2_precision_stderr": 0.0019087131351150518, "rouge2_recall": 0.22575098541036898, "rouge2_recall_stderr": 0.0022679440038537117, "rougeL_fmeasure": 0.3232871465388061, "rougeL_fmeasure_stderr": 0.0017540023019821938, "rougeL_precision": 0.3220661789895318, "rougeL_precision_stderr": 0.0019807481977887516, "rougeL_recall": 0.35167106294020406, "rougeL_recall_stderr": 0.002412719701186727, "rougeLsum_fmeasure": 0.37187691948404467, "rougeLsum_fmeasure_stderr": 0.0020334507090512886, "rougeLsum_precision": 0.37040704761920407, "rougeLsum_precision_stderr": 0.002271535999762247, "rougeLsum_recall": 0.4036635693660681, "rougeLsum_recall_stderr": 0.0027100283713887775}}, "3": {"generate_text_restaurant": {"bleu": 12.176808245577714, "bleu_stderr": 0.1522141950985559, "rouge1_fmeasure": 0.4492578440402226, "rouge1_fmeasure_stderr": 0.0019667066974463896, "rouge1_precision": 0.44578562677217626, "rouge1_precision_stderr": 0.0022755220565382653, "rouge1_recall": 0.4889266631673112, "rouge1_recall_stderr": 0.0028806835048615135, "rouge2_fmeasure": 0.21198726153120898, "rouge2_fmeasure_stderr": 0.0018195540133290194, "rouge2_precision": 0.20934965847705986, "rouge2_precision_stderr": 0.0018748523719357, "rouge2_recall": 0.23374791215018093, "rouge2_recall_stderr": 0.0023263783868214196, "rougeL_fmeasure": 0.3276087887468011, "rougeL_fmeasure_stderr": 0.0017546007227396715, "rougeL_precision": 0.32511799875091574, "rougeL_precision_stderr": 0.001967046506806585, "rougeL_recall": 0.35739711489377646, "rougeL_recall_stderr": 0.002465697341506471, "rougeLsum_fmeasure": 0.3771411500357944, "rougeLsum_fmeasure_stderr": 0.0020137589520510426, "rougeLsum_precision": 0.3741382823179302, "rougeLsum_precision_stderr": 0.002234766274837387, "rougeLsum_recall": 0.4108171966535684, "rougeLsum_recall_stderr": 0.0027621282248548955}}, "4": {"generate_text_restaurant": {"bleu": 12.402384292539924, "bleu_stderr": 0.135024423037405, "rouge1_fmeasure": 0.4552455893734124, "rouge1_fmeasure_stderr": 0.0019583919126399917, "rouge1_precision": 0.4500297698600483, "rouge1_precision_stderr": 0.002279184782036985, "rouge1_recall": 0.49411520793728253, "rouge1_recall_stderr": 0.0028065754284846934, "rouge2_fmeasure": 0.2155975402534293, "rouge2_fmeasure_stderr": 0.001873826925854458, "rouge2_precision": 0.2124489885242478, "rouge2_precision_stderr": 0.001924872387744921, "rouge2_recall": 0.23653217550233827, "rouge2_recall_stderr": 0.0023355573505469554, "rougeL_fmeasure": 0.33116918396735845, "rougeL_fmeasure_stderr": 0.0017792810189211944, "rougeL_precision": 0.32709295569865143, "rougeL_precision_stderr": 0.001956608790763274, "rougeL_recall": 0.3602236836443227, "rougeL_recall_stderr": 0.002427496751180958, "rougeLsum_fmeasure": 0.3808847920405725, "rougeLsum_fmeasure_stderr": 0.002065156905180787, "rougeLsum_precision": 0.37596847671524797, "rougeLsum_precision_stderr": 0.002251960910942368, "rougeLsum_recall": 0.41399734380099557, "rougeLsum_recall_stderr": 0.0027661212857413163}}, "5": {"generate_text_restaurant": {"bleu": 12.262420161138401, "bleu_stderr": 0.18903914838075944, "rouge1_fmeasure": 0.45590768378496416, "rouge1_fmeasure_stderr": 0.001962173998693794, "rouge1_precision": 0.4494732628845236, "rouge1_precision_stderr": 0.0023023795088449608, "rouge1_recall": 0.49537684561519607, "rouge1_recall_stderr": 0.002767197249479356, "rouge2_fmeasure": 0.21627498227149344, "rouge2_fmeasure_stderr": 0.0018499821709783027, "rouge2_precision": 0.21297442064638186, "rouge2_precision_stderr": 0.001926070075686657, "rouge2_recall": 0.23703970881860875, "rouge2_recall_stderr": 0.0022735756287929677, "rougeL_fmeasure": 0.3331238951307013, "rougeL_fmeasure_stderr": 0.0017799650141595914, "rougeL_precision": 0.3281425210661801, "rougeL_precision_stderr": 0.001972901049188069, "rougeL_recall": 0.3628730729029585, "rougeL_recall_stderr": 0.0024176721948621425, "rougeLsum_fmeasure": 0.38246870938641964, "rougeLsum_fmeasure_stderr": 0.0020417804934292076, "rougeLsum_precision": 0.3768630011135112, "rougeLsum_precision_stderr": 0.0022664490914563948, "rougeLsum_recall": 0.41605530731114854, "rougeLsum_recall_stderr": 0.0027097026120309876}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.24608084382355, "bleu_stderr": 0.11799890497604383, "rouge1_fmeasure": 0.22178670977262957, "rouge1_fmeasure_stderr": 0.0026603762671440385, "rouge1_precision": 0.17400888678487844, "rouge1_precision_stderr": 0.002464330237468343, "rouge1_recall": 0.3502023417511915, "rouge1_recall_stderr": 0.004626152702942569, "rouge2_fmeasure": 0.05332080380122619, "rouge2_fmeasure_stderr": 0.0017188362970072153, "rouge2_precision": 0.040704743736010573, "rouge2_precision_stderr": 0.001393829442555013, "rouge2_recall": 0.08799878519146338, "rouge2_recall_stderr": 0.0029196986234636744, "rougeL_fmeasure": 0.16609949129334142, "rougeL_fmeasure_stderr": 0.0020543947028805716, "rougeL_precision": 0.12992137120722907, "rougeL_precision_stderr": 0.0018781656373678345, "rougeL_recall": 0.2641999836224796, "rougeL_recall_stderr": 0.0036812097901456225, "rougeLsum_fmeasure": 0.17331928576093303, "rougeLsum_fmeasure_stderr": 0.0022434557390865763, "rougeLsum_precision": 0.13526811734802355, "rougeLsum_precision_stderr": 0.001975414967835506, "rougeLsum_recall": 0.27591607890974656, "rougeLsum_recall_stderr": 0.004020504841332198}}, "1": {"article_DOC_summary": {"bleu": 1.8189118554354626, "bleu_stderr": 0.10029587869243096, "rouge1_fmeasure": 0.19671334605636295, "rouge1_fmeasure_stderr": 0.0027188446769093686, "rouge1_precision": 0.14019002697714425, "rouge1_precision_stderr": 0.002033439876939429, "rouge1_recall": 0.3437328928644267, "rouge1_recall_stderr": 0.004622184360550014, "rouge2_fmeasure": 0.04451173090845151, "rouge2_fmeasure_stderr": 0.0016237255917685457, "rouge2_precision": 0.031342583459769556, "rouge2_precision_stderr": 0.0011487670391079187, "rouge2_recall": 0.0803191311481453, "rouge2_recall_stderr": 0.003010334414958687, "rougeL_fmeasure": 0.14825510655652663, "rougeL_fmeasure_stderr": 0.0020028300424767575, "rougeL_precision": 0.1053445488770993, "rougeL_precision_stderr": 0.001476955053927121, "rougeL_recall": 0.2613986979531857, "rougeL_recall_stderr": 0.0035961912243557873, "rougeLsum_fmeasure": 0.1592072439127538, "rougeLsum_fmeasure_stderr": 0.002270863529627419, "rougeLsum_precision": 0.11322117707364136, "rougeLsum_precision_stderr": 0.001678631461460253, "rougeLsum_recall": 0.27993778037490374, "rougeLsum_recall_stderr": 0.003988822898958396}}, "2": {"article_DOC_summary": {"bleu": 2.0454443359589503, "bleu_stderr": 0.08368970692051826, "rouge1_fmeasure": 0.20889368966335237, "rouge1_fmeasure_stderr": 0.0026887281040736378, "rouge1_precision": 0.1488455450923004, "rouge1_precision_stderr": 0.002009481565468108, "rouge1_recall": 0.3642751817540943, "rouge1_recall_stderr": 0.004565498697439836, "rouge2_fmeasure": 0.05083267238349886, "rouge2_fmeasure_stderr": 0.001670794310427006, "rouge2_precision": 0.03580310577134537, "rouge2_precision_stderr": 0.0011818056791973873, "rouge2_recall": 0.09140711571916593, "rouge2_recall_stderr": 0.003096432205185466, "rougeL_fmeasure": 0.15708491403271296, "rougeL_fmeasure_stderr": 0.0019982552541995204, "rougeL_precision": 0.11164126156649677, "rougeL_precision_stderr": 0.001473760236619727, "rougeL_recall": 0.27611637516436727, "rougeL_recall_stderr": 0.003575935893484929, "rougeLsum_fmeasure": 0.1672065849237029, "rougeLsum_fmeasure_stderr": 0.002247088064282535, "rougeLsum_precision": 0.1188385887857819, "rougeLsum_precision_stderr": 0.0016510150185409913, "rougeLsum_recall": 0.2937542835590118, "rougeLsum_recall_stderr": 0.003999046140359821}}}}
|
8b7178b35b/evaluation/rankeval/8b7178b35b_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.334,0.01492201952373296,0
|
3 |
+
anli_r2,acc,0.341,0.0149981313484027,0
|
4 |
+
anli_r3,acc,0.3358333333333333,0.013639261190932882,0
|
5 |
+
arc_challenge,acc,0.3148464163822526,0.013572657703084948,0
|
6 |
+
arc_challenge,acc_norm,0.34897610921501704,0.013928933461382496,0
|
7 |
+
arc_easy,acc,0.6599326599326599,0.009720765494805276,0
|
8 |
+
arc_easy,acc_norm,0.5984848484848485,0.010058790020755562,0
|
9 |
+
boolq,acc,0.5951070336391437,0.008585393347962317,1
|
10 |
+
cb,acc,0.39285714285714285,0.0658538889806635,1
|
11 |
+
cb,f1,0.18803418803418803,,1
|
12 |
+
copa,acc,0.82,0.038612291966536955,0
|
13 |
+
hellaswag,acc,0.5300736904999004,0.004980747448813311,0
|
14 |
+
hellaswag,acc_norm,0.7024497112129058,0.004562462665505219,0
|
15 |
+
piqa,acc,0.779107725788901,0.009679088048842217,0
|
16 |
+
piqa,acc_norm,0.7878128400435256,0.009539299828174044,0
|
17 |
+
rte,acc,0.5667870036101083,0.029826764082138274,0
|
18 |
+
sciq,acc,0.894,0.009739551265785141,0
|
19 |
+
sciq,acc_norm,0.828,0.011939788882495321,0
|
20 |
+
storycloze_2016,acc,0.7541421699625869,0.009957443066942233,0
|
21 |
+
winogrande,acc,0.6227308602999211,0.013622567928799503,0
|
8b7178b35b/evaluation/rankeval/8b7178b35b_0_lm-eval_global_step84877_2023-01-30-20-00-09_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.334,
|
5 |
-
"acc_stderr": 0.01492201952373296
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.341,
|
9 |
-
"acc_stderr": 0.0149981313484027
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3358333333333333,
|
13 |
-
"acc_stderr": 0.013639261190932882
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.39285714285714285,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.18803418803418803
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.82,
|
22 |
-
"acc_stderr": 0.038612291966536955
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5300736904999004,
|
26 |
-
"acc_stderr": 0.004980747448813311,
|
27 |
-
"acc_norm": 0.7024497112129058,
|
28 |
-
"acc_norm_stderr": 0.004562462665505219
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5667870036101083,
|
32 |
-
"acc_stderr": 0.029826764082138274
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6227308602999211,
|
36 |
-
"acc_stderr": 0.013622567928799503
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7541421699625869,
|
40 |
-
"acc_stderr": 0.009957443066942233
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5951070336391437,
|
44 |
-
"acc_stderr": 0.008585393347962317
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6599326599326599,
|
48 |
-
"acc_stderr": 0.009720765494805276,
|
49 |
-
"acc_norm": 0.5984848484848485,
|
50 |
-
"acc_norm_stderr": 0.010058790020755562
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3148464163822526,
|
54 |
-
"acc_stderr": 0.013572657703084948,
|
55 |
-
"acc_norm": 0.34897610921501704,
|
56 |
-
"acc_norm_stderr": 0.013928933461382496
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.894,
|
60 |
-
"acc_stderr": 0.009739551265785141,
|
61 |
-
"acc_norm": 0.828,
|
62 |
-
"acc_norm_stderr": 0.011939788882495321
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.779107725788901,
|
66 |
-
"acc_stderr": 0.009679088048842217,
|
67 |
-
"acc_norm": 0.7878128400435256,
|
68 |
-
"acc_norm_stderr": 0.009539299828174044
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b35b/evaluation/rankeval/8b7178b35b_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.328,0.014853842487270334,0
|
3 |
+
anli_r2,acc,0.321,0.014770821817934649,0
|
4 |
+
anli_r3,acc,0.3491666666666667,0.013767075395077249,0
|
5 |
+
arc_challenge,acc,0.3250853242320819,0.013688147309729124,0
|
6 |
+
arc_challenge,acc_norm,0.35665529010238906,0.013998056902620199,0
|
7 |
+
arc_easy,acc,0.6742424242424242,0.009616642976885964,0
|
8 |
+
arc_easy,acc_norm,0.6405723905723906,0.009845958893373752,0
|
9 |
+
boolq,acc,0.6489296636085627,0.00834811495726361,1
|
10 |
+
cb,acc,0.4642857142857143,0.06724777654937658,1
|
11 |
+
cb,f1,0.3013448230839535,,1
|
12 |
+
copa,acc,0.79,0.040936018074033256,0
|
13 |
+
hellaswag,acc,0.5251941844254132,0.00498344288867777,0
|
14 |
+
hellaswag,acc_norm,0.6989643497311293,0.0045777070250313644,0
|
15 |
+
piqa,acc,0.7747551686615887,0.009746643471032145,0
|
16 |
+
piqa,acc_norm,0.779651795429815,0.00967053545685313,0
|
17 |
+
rte,acc,0.5342960288808665,0.030025579819366426,0
|
18 |
+
sciq,acc,0.916,0.008776162089491127,0
|
19 |
+
sciq,acc_norm,0.897,0.009616833339695792,0
|
20 |
+
storycloze_2016,acc,0.7504008551576697,0.010008002459430844,0
|
21 |
+
winogrande,acc,0.6108918705603789,0.013702520871485945,0
|
8b7178b35b/evaluation/rankeval/8b7178b35b_1_lm-eval_global_step84877_2023-01-30-20-00-09_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.328,
|
5 |
-
"acc_stderr": 0.014853842487270334
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.321,
|
9 |
-
"acc_stderr": 0.014770821817934649
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3491666666666667,
|
13 |
-
"acc_stderr": 0.013767075395077249
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.4642857142857143,
|
17 |
-
"acc_stderr": 0.06724777654937658,
|
18 |
-
"f1": 0.3013448230839535
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.79,
|
22 |
-
"acc_stderr": 0.040936018074033256
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5251941844254132,
|
26 |
-
"acc_stderr": 0.00498344288867777,
|
27 |
-
"acc_norm": 0.6989643497311293,
|
28 |
-
"acc_norm_stderr": 0.0045777070250313644
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5342960288808665,
|
32 |
-
"acc_stderr": 0.030025579819366426
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6108918705603789,
|
36 |
-
"acc_stderr": 0.013702520871485945
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7504008551576697,
|
40 |
-
"acc_stderr": 0.010008002459430844
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6489296636085627,
|
44 |
-
"acc_stderr": 0.00834811495726361
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6742424242424242,
|
48 |
-
"acc_stderr": 0.009616642976885964,
|
49 |
-
"acc_norm": 0.6405723905723906,
|
50 |
-
"acc_norm_stderr": 0.009845958893373752
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3250853242320819,
|
54 |
-
"acc_stderr": 0.013688147309729124,
|
55 |
-
"acc_norm": 0.35665529010238906,
|
56 |
-
"acc_norm_stderr": 0.013998056902620199
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.916,
|
60 |
-
"acc_stderr": 0.008776162089491127,
|
61 |
-
"acc_norm": 0.897,
|
62 |
-
"acc_norm_stderr": 0.009616833339695792
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7747551686615887,
|
66 |
-
"acc_stderr": 0.009746643471032145,
|
67 |
-
"acc_norm": 0.779651795429815,
|
68 |
-
"acc_norm_stderr": 0.00967053545685313
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b35b/evaluation/rankeval/8b7178b35b_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.319,0.014746404865473486,0
|
3 |
+
anli_r2,acc,0.33,0.01487687202745673,0
|
4 |
+
anli_r3,acc,0.32,0.013471620929769142,0
|
5 |
+
arc_challenge,acc,0.33276450511945393,0.01376986304619231,0
|
6 |
+
arc_challenge,acc_norm,0.3643344709897611,0.014063260279882415,0
|
7 |
+
arc_easy,acc,0.6784511784511784,0.00958409157564062,0
|
8 |
+
arc_easy,acc_norm,0.6607744107744108,0.00971491720776585,0
|
9 |
+
boolq,acc,0.6590214067278287,0.00829097981816109,1
|
10 |
+
cb,acc,0.42857142857142855,0.06672848092813058,1
|
11 |
+
cb,f1,0.3049446887911502,,1
|
12 |
+
copa,acc,0.82,0.038612291966536955,0
|
13 |
+
hellaswag,acc,0.5259908384783908,0.004983035420235712,0
|
14 |
+
hellaswag,acc_norm,0.7002589125672177,0.004572081656965643,0
|
15 |
+
piqa,acc,0.7780195865070729,0.009696120744662019,0
|
16 |
+
piqa,acc_norm,0.7872687704026116,0.009548223123047352,0
|
17 |
+
rte,acc,0.5812274368231047,0.02969666108123484,0
|
18 |
+
sciq,acc,0.93,0.008072494358323499,0
|
19 |
+
sciq,acc_norm,0.91,0.009054390204866442,0
|
20 |
+
storycloze_2016,acc,0.7616247995724211,0.009853267441685421,0
|
21 |
+
winogrande,acc,0.6266771902131019,0.013594002763035516,0
|
8b7178b35b/evaluation/rankeval/8b7178b35b_2_lm-eval_global_step84877_2023-01-30-20-00-09_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.319,
|
5 |
-
"acc_stderr": 0.014746404865473486
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.33,
|
9 |
-
"acc_stderr": 0.01487687202745673
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.32,
|
13 |
-
"acc_stderr": 0.013471620929769142
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.42857142857142855,
|
17 |
-
"acc_stderr": 0.06672848092813058,
|
18 |
-
"f1": 0.3049446887911502
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.82,
|
22 |
-
"acc_stderr": 0.038612291966536955
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5259908384783908,
|
26 |
-
"acc_stderr": 0.004983035420235712,
|
27 |
-
"acc_norm": 0.7002589125672177,
|
28 |
-
"acc_norm_stderr": 0.004572081656965643
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5812274368231047,
|
32 |
-
"acc_stderr": 0.02969666108123484
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6266771902131019,
|
36 |
-
"acc_stderr": 0.013594002763035516
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7616247995724211,
|
40 |
-
"acc_stderr": 0.009853267441685421
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6590214067278287,
|
44 |
-
"acc_stderr": 0.00829097981816109
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6784511784511784,
|
48 |
-
"acc_stderr": 0.00958409157564062,
|
49 |
-
"acc_norm": 0.6607744107744108,
|
50 |
-
"acc_norm_stderr": 0.00971491720776585
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.33276450511945393,
|
54 |
-
"acc_stderr": 0.01376986304619231,
|
55 |
-
"acc_norm": 0.3643344709897611,
|
56 |
-
"acc_norm_stderr": 0.014063260279882415
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.93,
|
60 |
-
"acc_stderr": 0.008072494358323499,
|
61 |
-
"acc_norm": 0.91,
|
62 |
-
"acc_norm_stderr": 0.009054390204866442
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7780195865070729,
|
66 |
-
"acc_stderr": 0.009696120744662019,
|
67 |
-
"acc_norm": 0.7872687704026116,
|
68 |
-
"acc_norm_stderr": 0.009548223123047352
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b35b/evaluation/rankeval/8b7178b35b_3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.322,0.014782913600996674,0
|
3 |
+
anli_r2,acc,0.332,0.014899597242811483,0
|
4 |
+
anli_r3,acc,0.3433333333333333,0.01371263383046586,0
|
5 |
+
arc_challenge,acc,0.3293515358361775,0.013734057652635473,0
|
6 |
+
arc_challenge,acc_norm,0.3515358361774744,0.013952413699600943,0
|
7 |
+
arc_easy,acc,0.6654040404040404,0.009682137724327909,0
|
8 |
+
arc_easy,acc_norm,0.6595117845117845,0.009723676813825861,0
|
9 |
+
boolq,acc,0.6461773700305811,0.008362983020904467,1
|
10 |
+
cb,acc,0.39285714285714285,0.0658538889806635,1
|
11 |
+
cb,f1,0.32857142857142857,,1
|
12 |
+
copa,acc,0.82,0.038612291966536955,0
|
13 |
+
hellaswag,acc,0.5291774546903008,0.004981278326428018,0
|
14 |
+
hellaswag,acc_norm,0.7015534754033061,0.004566412808642458,0
|
15 |
+
piqa,acc,0.7818280739934712,0.009636081958374381,0
|
16 |
+
piqa,acc_norm,0.7905331882480957,0.009494302979819806,0
|
17 |
+
rte,acc,0.6064981949458483,0.029405839314203198,0
|
18 |
+
sciq,acc,0.932,0.007964887911291605,0
|
19 |
+
sciq,acc_norm,0.918,0.008680515615523732,0
|
20 |
+
storycloze_2016,acc,0.7589524318546232,0.00989094649057694,0
|
21 |
+
winogrande,acc,0.6416732438831886,0.01347658117256753,0
|
8b7178b35b/evaluation/rankeval/8b7178b35b_3_lm-eval_global_step84877_2023-01-30-20-00-09_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.322,
|
5 |
-
"acc_stderr": 0.014782913600996674
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.332,
|
9 |
-
"acc_stderr": 0.014899597242811483
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3433333333333333,
|
13 |
-
"acc_stderr": 0.01371263383046586
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.39285714285714285,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.32857142857142857
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.82,
|
22 |
-
"acc_stderr": 0.038612291966536955
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5291774546903008,
|
26 |
-
"acc_stderr": 0.004981278326428018,
|
27 |
-
"acc_norm": 0.7015534754033061,
|
28 |
-
"acc_norm_stderr": 0.004566412808642458
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.6064981949458483,
|
32 |
-
"acc_stderr": 0.029405839314203198
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6416732438831886,
|
36 |
-
"acc_stderr": 0.01347658117256753
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7589524318546232,
|
40 |
-
"acc_stderr": 0.00989094649057694
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6461773700305811,
|
44 |
-
"acc_stderr": 0.008362983020904467
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6654040404040404,
|
48 |
-
"acc_stderr": 0.009682137724327909,
|
49 |
-
"acc_norm": 0.6595117845117845,
|
50 |
-
"acc_norm_stderr": 0.009723676813825861
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3293515358361775,
|
54 |
-
"acc_stderr": 0.013734057652635473,
|
55 |
-
"acc_norm": 0.3515358361774744,
|
56 |
-
"acc_norm_stderr": 0.013952413699600943
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.932,
|
60 |
-
"acc_stderr": 0.007964887911291605,
|
61 |
-
"acc_norm": 0.918,
|
62 |
-
"acc_norm_stderr": 0.008680515615523732
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7818280739934712,
|
66 |
-
"acc_stderr": 0.009636081958374381,
|
67 |
-
"acc_norm": 0.7905331882480957,
|
68 |
-
"acc_norm_stderr": 0.009494302979819806
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b35b/evaluation/rankeval/8b7178b35b_4.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.317,0.01472167543888022,0
|
3 |
+
anli_r2,acc,0.337,0.014955087918653605,0
|
4 |
+
anli_r3,acc,0.33416666666666667,0.013622434813136783,0
|
5 |
+
arc_challenge,acc,0.3310580204778157,0.013752062419817836,0
|
6 |
+
arc_challenge,acc_norm,0.36689419795221845,0.014084133118104292,0
|
7 |
+
arc_easy,acc,0.6813973063973064,0.009560775507673366,0
|
8 |
+
arc_easy,acc_norm,0.6641414141414141,0.009691180932083508,0
|
9 |
+
boolq,acc,0.65565749235474,0.008310485054782981,1
|
10 |
+
cb,acc,0.4642857142857143,0.06724777654937658,1
|
11 |
+
cb,f1,0.38713450292397655,,1
|
12 |
+
copa,acc,0.83,0.03775251680686371,0
|
13 |
+
hellaswag,acc,0.5299741087432782,0.004980807231136743,0
|
14 |
+
hellaswag,acc_norm,0.7079267078271261,0.004537865171414025,0
|
15 |
+
piqa,acc,0.7845484221980413,0.009592463115658117,0
|
16 |
+
piqa,acc_norm,0.7899891186071817,0.009503353305818578,0
|
17 |
+
rte,acc,0.5884476534296029,0.029621832222417196,0
|
18 |
+
sciq,acc,0.939,0.007572076091557426,0
|
19 |
+
sciq,acc_norm,0.926,0.008282064512704156,0
|
20 |
+
storycloze_2016,acc,0.7659005879208979,0.009791868211495304,0
|
21 |
+
winogrande,acc,0.6353591160220995,0.013527746622429837,0
|
8b7178b35b/evaluation/rankeval/8b7178b35b_4_lm-eval_global_step84877_2023-01-30-20-00-09_4shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.317,
|
5 |
-
"acc_stderr": 0.01472167543888022
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.337,
|
9 |
-
"acc_stderr": 0.014955087918653605
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.33416666666666667,
|
13 |
-
"acc_stderr": 0.013622434813136783
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.4642857142857143,
|
17 |
-
"acc_stderr": 0.06724777654937658,
|
18 |
-
"f1": 0.38713450292397655
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.83,
|
22 |
-
"acc_stderr": 0.03775251680686371
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5299741087432782,
|
26 |
-
"acc_stderr": 0.004980807231136743,
|
27 |
-
"acc_norm": 0.7079267078271261,
|
28 |
-
"acc_norm_stderr": 0.004537865171414025
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5884476534296029,
|
32 |
-
"acc_stderr": 0.029621832222417196
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6353591160220995,
|
36 |
-
"acc_stderr": 0.013527746622429837
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7659005879208979,
|
40 |
-
"acc_stderr": 0.009791868211495304
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.65565749235474,
|
44 |
-
"acc_stderr": 0.008310485054782981
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6813973063973064,
|
48 |
-
"acc_stderr": 0.009560775507673366,
|
49 |
-
"acc_norm": 0.6641414141414141,
|
50 |
-
"acc_norm_stderr": 0.009691180932083508
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3310580204778157,
|
54 |
-
"acc_stderr": 0.013752062419817836,
|
55 |
-
"acc_norm": 0.36689419795221845,
|
56 |
-
"acc_norm_stderr": 0.014084133118104292
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.939,
|
60 |
-
"acc_stderr": 0.007572076091557426,
|
61 |
-
"acc_norm": 0.926,
|
62 |
-
"acc_norm_stderr": 0.008282064512704156
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7845484221980413,
|
66 |
-
"acc_stderr": 0.009592463115658117,
|
67 |
-
"acc_norm": 0.7899891186071817,
|
68 |
-
"acc_norm_stderr": 0.009503353305818578
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b35b/evaluation/rankeval/8b7178b35b_5.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.328,0.014853842487270334,0
|
3 |
+
anli_r2,acc,0.321,0.01477082181793464,0
|
4 |
+
anli_r3,acc,0.33416666666666667,0.013622434813136783,0
|
5 |
+
arc_challenge,acc,0.3455631399317406,0.01389693846114569,0
|
6 |
+
arc_challenge,acc_norm,0.3583617747440273,0.014012883334859859,0
|
7 |
+
arc_easy,acc,0.6914983164983165,0.009477472342978122,0
|
8 |
+
arc_easy,acc_norm,0.6734006734006734,0.009623047038267656,0
|
9 |
+
boolq,acc,0.6584097859327217,0.008294560677768487,1
|
10 |
+
cb,acc,0.42857142857142855,0.06672848092813058,1
|
11 |
+
cb,f1,0.30424242424242426,,1
|
12 |
+
copa,acc,0.82,0.038612291966536955,0
|
13 |
+
hellaswag,acc,0.528779127663812,0.004981509099276353,0
|
14 |
+
hellaswag,acc_norm,0.7054371639115714,0.004549143750428458,0
|
15 |
+
piqa,acc,0.7763873775843307,0.009721489519176289,0
|
16 |
+
piqa,acc_norm,0.7872687704026116,0.009548223123047346,0
|
17 |
+
rte,acc,0.5992779783393501,0.029497229237163147,0
|
18 |
+
sciq,acc,0.931,0.008018934050315157,0
|
19 |
+
sciq,acc_norm,0.923,0.008434580140240634,0
|
20 |
+
storycloze_2016,acc,0.7594869053981828,0.009883453084862687,0
|
21 |
+
winogrande,acc,0.6464088397790055,0.013436541262599955,0
|
8b7178b35b/evaluation/rankeval/8b7178b35b_5_lm-eval_global_step84877_2023-01-30-20-00-09_5shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.328,
|
5 |
-
"acc_stderr": 0.014853842487270334
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.321,
|
9 |
-
"acc_stderr": 0.01477082181793464
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.33416666666666667,
|
13 |
-
"acc_stderr": 0.013622434813136783
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.42857142857142855,
|
17 |
-
"acc_stderr": 0.06672848092813058,
|
18 |
-
"f1": 0.30424242424242426
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.82,
|
22 |
-
"acc_stderr": 0.038612291966536955
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.528779127663812,
|
26 |
-
"acc_stderr": 0.004981509099276353,
|
27 |
-
"acc_norm": 0.7054371639115714,
|
28 |
-
"acc_norm_stderr": 0.004549143750428458
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5992779783393501,
|
32 |
-
"acc_stderr": 0.029497229237163147
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6464088397790055,
|
36 |
-
"acc_stderr": 0.013436541262599955
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7594869053981828,
|
40 |
-
"acc_stderr": 0.009883453084862687
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6584097859327217,
|
44 |
-
"acc_stderr": 0.008294560677768487
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6914983164983165,
|
48 |
-
"acc_stderr": 0.009477472342978122,
|
49 |
-
"acc_norm": 0.6734006734006734,
|
50 |
-
"acc_norm_stderr": 0.009623047038267656
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3455631399317406,
|
54 |
-
"acc_stderr": 0.01389693846114569,
|
55 |
-
"acc_norm": 0.3583617747440273,
|
56 |
-
"acc_norm_stderr": 0.014012883334859859
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.931,
|
60 |
-
"acc_stderr": 0.008018934050315157,
|
61 |
-
"acc_norm": 0.923,
|
62 |
-
"acc_norm_stderr": 0.008434580140240634
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7763873775843307,
|
66 |
-
"acc_stderr": 0.009721489519176289,
|
67 |
-
"acc_norm": 0.7872687704026116,
|
68 |
-
"acc_norm_stderr": 0.009548223123047346
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b44b/evaluation/generation/merged.csv
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.08140776012327813
|
3 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.08140776012327813
|
4 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.19963881437447786
|
5 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.19963881437447786
|
6 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.22343026823022358
|
7 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.22343026823022358
|
8 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2299932736088538
|
9 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2299932736088538
|
10 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.231134956488486
|
11 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.231134956488486
|
12 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.23283624336459657
|
13 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.23283624336459657
|
14 |
+
e2e_nlg_cleaned,5,average,multiple,0.19974021936498598
|
15 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.05370104596286604
|
16 |
+
gem_xsum,0,median,rouge2_fmeasure,0.05370104596286604
|
17 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04598183981683866
|
18 |
+
gem_xsum,1,median,rouge2_fmeasure,0.04598183981683866
|
19 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05795467773425469
|
20 |
+
gem_xsum,2,median,rouge2_fmeasure,0.05795467773425469
|
21 |
+
gem_xsum,2,average,multiple,0.05254585450465313
|
22 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04887042562262386
|
23 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.04887042562262386
|
24 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05575099241882265
|
25 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.05575099241882265
|
26 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.055072070164922673
|
27 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.055072070164922673
|
28 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05572250969924929
|
29 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.05572250969924929
|
30 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05965148393261353
|
31 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.05965148393261353
|
32 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.061471018231031564
|
33 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.061471018231031564
|
34 |
+
web_nlg_en,5,average,multiple,0.056089750011543926
|
35 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04216588190254503
|
36 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.04216588190254503
|
37 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.06178715930121975
|
38 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.06178715930121975
|
39 |
+
wiki_lingua_en,1,average,multiple,0.05197652060188239
|
8b7178b44b/evaluation/generation/merged.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.2975989706476116, "bleu_stderr": 0.04003039263082336, "rouge1_fmeasure": 0.10454253577265941, "rouge1_fmeasure_stderr": 0.0020033273254233754, "rouge1_precision": 0.07036951453310922, "rouge1_precision_stderr": 0.001708454486390292, "rouge1_recall": 0.28391423378552716, "rouge1_recall_stderr": 0.004406605071545494, "rouge2_fmeasure": 0.04887042562262386, "rouge2_fmeasure_stderr": 0.0012367653733248375, "rouge2_precision": 0.03205700906292, "rouge2_precision_stderr": 0.0008904068747417852, "rouge2_recall": 0.1344860970658841, "rouge2_recall_stderr": 0.0029699527724654008, "rougeL_fmeasure": 0.09984712791182629, "rougeL_fmeasure_stderr": 0.001864385700385995, "rougeL_precision": 0.06700444340435573, "rougeL_precision_stderr": 0.001577635401560107, "rougeL_recall": 0.2723165283491304, "rougeL_recall_stderr": 0.00420334764816495, "rougeLsum_fmeasure": 0.09986723049046536, "rougeLsum_fmeasure_stderr": 0.0018921794146755414, "rougeLsum_precision": 0.06714085166592883, "rougeLsum_precision_stderr": 0.0016058922247288731, "rougeLsum_recall": 0.2710598887897389, "rougeLsum_recall_stderr": 0.004143182286682885}}, "1": {"PALM_prompt": {"bleu": 0.5399102183044338, "bleu_stderr": 0.017964459975753112, "rouge1_fmeasure": 0.1179994472699294, "rouge1_fmeasure_stderr": 0.001898574018978891, "rouge1_precision": 0.07582800253640251, "rouge1_precision_stderr": 0.0014775645988873703, "rouge1_recall": 0.3887660352664638, "rouge1_recall_stderr": 0.005510916338178581, "rouge2_fmeasure": 0.05575099241882265, "rouge2_fmeasure_stderr": 0.0011943826599908319, "rouge2_precision": 0.03547782475927522, "rouge2_precision_stderr": 0.000840851630406, "rouge2_recall": 0.19434244564750444, "rouge2_recall_stderr": 0.0039041652957777923, "rougeL_fmeasure": 0.10987367600038657, "rougeL_fmeasure_stderr": 0.0017019944711869266, "rougeL_precision": 0.07062905559753234, "rougeL_precision_stderr": 0.0013389008252074547, "rougeL_recall": 0.3594853678419995, "rougeL_recall_stderr": 0.004899885542632218, "rougeLsum_fmeasure": 0.11246764238131458, "rougeLsum_fmeasure_stderr": 0.001788765887772429, "rougeLsum_precision": 0.07235075590060619, "rougeLsum_precision_stderr": 0.0014035297849105085, "rougeLsum_recall": 0.36849778247126685, "rougeLsum_recall_stderr": 0.005075094016447408}}, "2": {"PALM_prompt": {"bleu": 0.6132285241911561, "bleu_stderr": 0.021744492816193794, "rouge1_fmeasure": 0.11750948362933708, "rouge1_fmeasure_stderr": 0.0017443663495669947, "rouge1_precision": 0.07454923688966983, "rouge1_precision_stderr": 0.0013402782053761812, "rouge1_recall": 0.4060696737056673, "rouge1_recall_stderr": 0.005408951548341353, "rouge2_fmeasure": 0.055072070164922673, "rouge2_fmeasure_stderr": 0.0011249794411084804, "rouge2_precision": 0.034783751809391424, "rouge2_precision_stderr": 0.0008624538099408842, "rouge2_recall": 0.20529500851088617, "rouge2_recall_stderr": 0.004029599291312284, "rougeL_fmeasure": 0.1081453556034095, "rougeL_fmeasure_stderr": 0.0015836947383010406, "rougeL_precision": 0.06870503701115688, "rougeL_precision_stderr": 0.001214108724498691, "rougeL_recall": 0.36958240896722416, "rougeL_recall_stderr": 0.004690608909122334, "rougeLsum_fmeasure": 0.11165202694972098, "rougeLsum_fmeasure_stderr": 0.0016570784342064698, "rougeLsum_precision": 0.07091032443010714, "rougeLsum_precision_stderr": 0.0012648199327180544, "rougeLsum_recall": 0.383427293902694, "rougeLsum_recall_stderr": 0.004986353289044804}}, "3": {"PALM_prompt": {"bleu": 0.6689911149446237, "bleu_stderr": 0.03744306343722813, "rouge1_fmeasure": 0.1180307283191865, "rouge1_fmeasure_stderr": 0.0017866703950125558, "rouge1_precision": 0.07481052377527687, "rouge1_precision_stderr": 0.0013081535798312271, "rouge1_recall": 0.4066681705698085, "rouge1_recall_stderr": 0.0054019768500427, "rouge2_fmeasure": 0.05572250969924929, "rouge2_fmeasure_stderr": 0.0011431197746402597, "rouge2_precision": 0.03499189343685807, "rouge2_precision_stderr": 0.0007904982370484803, "rouge2_recall": 0.20736527087710988, "rouge2_recall_stderr": 0.0040148188536021905, "rougeL_fmeasure": 0.10739194121096401, "rougeL_fmeasure_stderr": 0.0015945105292757362, "rougeL_precision": 0.06809194203501431, "rougeL_precision_stderr": 0.0011605273645199663, "rougeL_recall": 0.36740563590690617, "rougeL_recall_stderr": 0.004685196144542859, "rougeLsum_fmeasure": 0.11166569262354312, "rougeLsum_fmeasure_stderr": 0.001692220078605611, "rougeLsum_precision": 0.07085369656177957, "rougeLsum_precision_stderr": 0.0012407868939625576, "rougeLsum_recall": 0.38310128345463584, "rougeLsum_recall_stderr": 0.004987172857446567}}, "4": {"PALM_prompt": {"bleu": 0.7331295013237835, "bleu_stderr": 0.0376308794527232, "rouge1_fmeasure": 0.12533762675953772, "rouge1_fmeasure_stderr": 0.0018878485389189842, "rouge1_precision": 0.07979502355783509, "rouge1_precision_stderr": 0.0014030109393831616, "rouge1_recall": 0.4235816766130294, "rouge1_recall_stderr": 0.005421570401845218, "rouge2_fmeasure": 0.05965148393261353, "rouge2_fmeasure_stderr": 0.0011811051795035776, "rouge2_precision": 0.037526756160207154, "rouge2_precision_stderr": 0.0008206560892571376, "rouge2_recall": 0.21978995547688931, "rouge2_recall_stderr": 0.004158496007987482, "rougeL_fmeasure": 0.1127973185683369, "rougeL_fmeasure_stderr": 0.0016137599087464715, "rougeL_precision": 0.07173000592003134, "rougeL_precision_stderr": 0.001189659942487538, "rougeL_recall": 0.3800311917113359, "rougeL_recall_stderr": 0.004634022951095827, "rougeLsum_fmeasure": 0.1188731408826614, "rougeLsum_fmeasure_stderr": 0.001769238872468489, "rougeLsum_precision": 0.07571045481706781, "rougeLsum_precision_stderr": 0.0013130071112336225, "rougeLsum_recall": 0.3998599380714254, "rougeLsum_recall_stderr": 0.004996013261269132}}, "5": {"PALM_prompt": {"bleu": 0.7199803964010689, "bleu_stderr": 0.0335341295220522, "rouge1_fmeasure": 0.12957731808842543, "rouge1_fmeasure_stderr": 0.001950385050411022, "rouge1_precision": 0.0831998254284886, "rouge1_precision_stderr": 0.0015598094416264543, "rouge1_recall": 0.43497959954822596, "rouge1_recall_stderr": 0.005519903741092491, "rouge2_fmeasure": 0.061471018231031564, "rouge2_fmeasure_stderr": 0.0011954291401081725, "rouge2_precision": 0.038915148561977614, "rouge2_precision_stderr": 0.0008746684971485049, "rouge2_recall": 0.22682184765709848, "rouge2_recall_stderr": 0.004248572295542693, "rougeL_fmeasure": 0.11444599253148456, "rougeL_fmeasure_stderr": 0.0016146327759809166, "rougeL_precision": 0.07313294781649872, "rougeL_precision_stderr": 0.0012487980616292505, "rougeL_recall": 0.3869620252624607, "rougeL_recall_stderr": 0.00472547888254755, "rougeLsum_fmeasure": 0.12182978494965954, "rougeLsum_fmeasure_stderr": 0.0017943723258623825, "rougeLsum_precision": 0.07806446348136394, "rougeLsum_precision_stderr": 0.0013916456734416912, "rougeLsum_recall": 0.4091496557022353, "rougeLsum_recall_stderr": 0.005082415980961682}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.953242244453973, "bleu_stderr": 0.05731680211281534, "rouge1_fmeasure": 0.19037826199272892, "rouge1_fmeasure_stderr": 0.0019330255590741413, "rouge1_precision": 0.16219138852188794, "rouge1_precision_stderr": 0.001952170268256085, "rouge1_recall": 0.27855214789998106, "rouge1_recall_stderr": 0.0028661354548397203, "rouge2_fmeasure": 0.04216588190254503, "rouge2_fmeasure_stderr": 0.0009553783013135788, "rouge2_precision": 0.0354263621636146, "rouge2_precision_stderr": 0.0008399737677670994, "rouge2_recall": 0.06484436785581019, "rouge2_recall_stderr": 0.001614050776584332, "rougeL_fmeasure": 0.1448489434443541, "rougeL_fmeasure_stderr": 0.001361425790826035, "rougeL_precision": 0.12175563950858005, "rougeL_precision_stderr": 0.0013221666940557008, "rougeL_recall": 0.21752194752302395, "rougeL_recall_stderr": 0.0023088753932079594, "rougeLsum_fmeasure": 0.1758237056132223, "rougeLsum_fmeasure_stderr": 0.0017714351825610913, "rougeLsum_precision": 0.14956126999475367, "rougeLsum_precision_stderr": 0.0017821025582316077, "rougeLsum_recall": 0.2582733333610139, "rougeLsum_recall_stderr": 0.0026810100132315094}}, "1": {"tldr_en": {"bleu": 3.1938176079049834, "bleu_stderr": 0.06758802228266159, "rouge1_fmeasure": 0.23210095477457815, "rouge1_fmeasure_stderr": 0.0019831122653105916, "rouge1_precision": 0.21449004123956839, "rouge1_precision_stderr": 0.002565225956613692, "rouge1_recall": 0.3240170155957018, "rouge1_recall_stderr": 0.002854256640412515, "rouge2_fmeasure": 0.06178715930121975, "rouge2_fmeasure_stderr": 0.0011277206664131291, "rouge2_precision": 0.05838461705053636, "rouge2_precision_stderr": 0.0014235851964682594, "rouge2_recall": 0.08878615660224992, "rouge2_recall_stderr": 0.0017942884481548583, "rougeL_fmeasure": 0.16714787573615286, "rougeL_fmeasure_stderr": 0.001398161530856219, "rougeL_precision": 0.15474204497973576, "rougeL_precision_stderr": 0.001972497669394072, "rougeL_recall": 0.23823572295131598, "rougeL_recall_stderr": 0.0022819725517357744, "rougeLsum_fmeasure": 0.2194764899179448, "rougeLsum_fmeasure_stderr": 0.0018724396931342306, "rougeLsum_precision": 0.20262040584770197, "rougeLsum_precision_stderr": 0.0024273351077249345, "rougeLsum_recall": 0.3070782317130203, "rougeLsum_recall_stderr": 0.0027244549656214464}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 5.041894180834452, "bleu_stderr": 0.13033007436218397, "rouge1_fmeasure": 0.2602520157352752, "rouge1_fmeasure_stderr": 0.0019814546233520676, "rouge1_precision": 0.2562147385675977, "rouge1_precision_stderr": 0.0023058315204108905, "rouge1_recall": 0.30315886913634216, "rouge1_recall_stderr": 0.0026139295974695565, "rouge2_fmeasure": 0.08140776012327813, "rouge2_fmeasure_stderr": 0.001311313047394141, "rouge2_precision": 0.07721561798611194, "rouge2_precision_stderr": 0.0012074751318368925, "rouge2_recall": 0.09865589414861335, "rouge2_recall_stderr": 0.0018266381810205243, "rougeL_fmeasure": 0.2348805493930538, "rougeL_fmeasure_stderr": 0.0016535155281064092, "rougeL_precision": 0.23076758584429627, "rougeL_precision_stderr": 0.0019576805274118237, "rougeL_recall": 0.27459675434984065, "rougeL_recall_stderr": 0.002244077056891058, "rougeLsum_fmeasure": 0.2231769159790997, "rougeLsum_fmeasure_stderr": 0.0018450572724071957, "rougeLsum_precision": 0.2179856280727458, "rougeLsum_precision_stderr": 0.0019996113997260653, "rougeLsum_recall": 0.26255348104591614, "rougeLsum_recall_stderr": 0.002535270034294405}}, "1": {"generate_text_restaurant": {"bleu": 11.374090854418796, "bleu_stderr": 0.16642693339608366, "rouge1_fmeasure": 0.44441293942301124, "rouge1_fmeasure_stderr": 0.0019990822428693255, "rouge1_precision": 0.4562660071544534, "rouge1_precision_stderr": 0.0023247072186540372, "rouge1_recall": 0.47023523973208886, "rouge1_recall_stderr": 0.0029204957902967815, "rouge2_fmeasure": 0.19963881437447786, "rouge2_fmeasure_stderr": 0.001763576201025895, "rouge2_precision": 0.20466887595780073, "rouge2_precision_stderr": 0.0018785112726430648, "rouge2_recall": 0.21306332737215863, "rouge2_recall_stderr": 0.0021659970991227857, "rougeL_fmeasure": 0.31715680141114483, "rougeL_fmeasure_stderr": 0.0017221275024008205, "rougeL_precision": 0.32653768131528194, "rougeL_precision_stderr": 0.001990750084311469, "rougeL_recall": 0.3355433404446685, "rougeL_recall_stderr": 0.0023788838500677285, "rougeLsum_fmeasure": 0.37193579102313756, "rougeLsum_fmeasure_stderr": 0.0020241065034827393, "rougeLsum_precision": 0.3824555027115406, "rougeLsum_precision_stderr": 0.0023079179899071546, "rougeLsum_recall": 0.39314046484640197, "rougeLsum_recall_stderr": 0.0027333955271206228}}, "2": {"generate_text_restaurant": {"bleu": 12.560644285364408, "bleu_stderr": 0.16815632193227215, "rouge1_fmeasure": 0.471505067248124, "rouge1_fmeasure_stderr": 0.0019357830388836816, "rouge1_precision": 0.4693114735453705, "rouge1_precision_stderr": 0.002281004588256248, "rouge1_recall": 0.5083374783023907, "rouge1_recall_stderr": 0.0027945657189866707, "rouge2_fmeasure": 0.22343026823022358, "rouge2_fmeasure_stderr": 0.0018374594088211456, "rouge2_precision": 0.2221316806859365, "rouge2_precision_stderr": 0.0019435645476744553, "rouge2_recall": 0.24313554807879642, "rouge2_recall_stderr": 0.002262346807934513, "rougeL_fmeasure": 0.34007419588321747, "rougeL_fmeasure_stderr": 0.0017661692599286885, "rougeL_precision": 0.33868884919458336, "rougeL_precision_stderr": 0.0020005248626700075, "rougeL_recall": 0.36715355671289307, "rougeL_recall_stderr": 0.002393826810405225, "rougeLsum_fmeasure": 0.3992367697974597, "rougeLsum_fmeasure_stderr": 0.002047758775999973, "rougeLsum_precision": 0.39764801002400024, "rougeLsum_precision_stderr": 0.0023126735358181896, "rougeLsum_recall": 0.4301763419894379, "rougeLsum_recall_stderr": 0.002704714026602679}}, "3": {"generate_text_restaurant": {"bleu": 12.935887027476703, "bleu_stderr": 0.1413974704174758, "rouge1_fmeasure": 0.47748617274088134, "rouge1_fmeasure_stderr": 0.0019432464320825518, "rouge1_precision": 0.46877980581573947, "rouge1_precision_stderr": 0.002306244616020727, "rouge1_recall": 0.5201076534915446, "rouge1_recall_stderr": 0.0027589098635390622, "rouge2_fmeasure": 0.2299932736088538, "rouge2_fmeasure_stderr": 0.0018542116692296976, "rouge2_precision": 0.22504131267180502, "rouge2_precision_stderr": 0.0019096188553057005, "rouge2_recall": 0.25334102505668565, "rouge2_recall_stderr": 0.0023303517782663806, "rougeL_fmeasure": 0.3453328850410215, "rougeL_fmeasure_stderr": 0.0017788674272907377, "rougeL_precision": 0.33848754169654177, "rougeL_precision_stderr": 0.0019586798135973788, "rougeL_recall": 0.3774849420782741, "rougeL_recall_stderr": 0.002448077823046622, "rougeLsum_fmeasure": 0.4054179095581331, "rougeLsum_fmeasure_stderr": 0.002057211964943938, "rougeLsum_precision": 0.398001467577397, "rougeLsum_precision_stderr": 0.002302044463314153, "rougeLsum_recall": 0.44176973604863884, "rougeLsum_recall_stderr": 0.002731272765488879}}, "4": {"generate_text_restaurant": {"bleu": 13.062203352639571, "bleu_stderr": 0.1443635676836731, "rouge1_fmeasure": 0.480197984035446, "rouge1_fmeasure_stderr": 0.0019374104232555646, "rouge1_precision": 0.4681188396235994, "rouge1_precision_stderr": 0.0022953424581571586, "rouge1_recall": 0.5257893103591965, "rouge1_recall_stderr": 0.0027113760261248936, "rouge2_fmeasure": 0.231134956488486, "rouge2_fmeasure_stderr": 0.0018720751391330711, "rouge2_precision": 0.2248618704612403, "rouge2_precision_stderr": 0.001945938539895268, "rouge2_recall": 0.2557332161082852, "rouge2_recall_stderr": 0.0023207877819305836, "rougeL_fmeasure": 0.34575847531397613, "rougeL_fmeasure_stderr": 0.0017883470089669242, "rougeL_precision": 0.33632345011254977, "rougeL_precision_stderr": 0.0019454027156019396, "rougeL_recall": 0.38013195807046607, "rougeL_recall_stderr": 0.00244327641775733, "rougeLsum_fmeasure": 0.4073959730873966, "rougeLsum_fmeasure_stderr": 0.002067697322138944, "rougeLsum_precision": 0.39692021210935596, "rougeLsum_precision_stderr": 0.0022989702493060767, "rougeLsum_recall": 0.44638101481881887, "rougeLsum_recall_stderr": 0.002709781160431362}}, "5": {"generate_text_restaurant": {"bleu": 13.007507775940796, "bleu_stderr": 0.15953149423790455, "rouge1_fmeasure": 0.4808919059561077, "rouge1_fmeasure_stderr": 0.001952787403665255, "rouge1_precision": 0.46770150316298764, "rouge1_precision_stderr": 0.0022797897877814698, "rouge1_recall": 0.5268279632857007, "rouge1_recall_stderr": 0.0027460682371807316, "rouge2_fmeasure": 0.23283624336459657, "rouge2_fmeasure_stderr": 0.0018731934526011642, "rouge2_precision": 0.22559527103944196, "rouge2_precision_stderr": 0.0019147802508556583, "rouge2_recall": 0.2579313360551791, "rouge2_recall_stderr": 0.00233240570767885, "rougeL_fmeasure": 0.3483900918258692, "rougeL_fmeasure_stderr": 0.0018126238683006549, "rougeL_precision": 0.33809291109163425, "rougeL_precision_stderr": 0.0019479891358002816, "rougeL_recall": 0.3830487611318609, "rougeL_recall_stderr": 0.0024664916023592687, "rougeLsum_fmeasure": 0.40962759438618196, "rougeLsum_fmeasure_stderr": 0.002070580879609581, "rougeLsum_precision": 0.3985001878422045, "rougeLsum_precision_stderr": 0.002294921272754354, "rougeLsum_recall": 0.4486089837026324, "rougeLsum_recall_stderr": 0.002709717530773792}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.202159785681064, "bleu_stderr": 0.1019905127598651, "rouge1_fmeasure": 0.21747292414186656, "rouge1_fmeasure_stderr": 0.0025362641546465615, "rouge1_precision": 0.15620442016026878, "rouge1_precision_stderr": 0.0019393097407504807, "rouge1_recall": 0.37637904496982905, "rouge1_recall_stderr": 0.004428769631922407, "rouge2_fmeasure": 0.05370104596286604, "rouge2_fmeasure_stderr": 0.001644696030213316, "rouge2_precision": 0.03798752030386553, "rouge2_precision_stderr": 0.001174899351649945, "rouge2_recall": 0.09645314548431894, "rouge2_recall_stderr": 0.0030269539528850957, "rougeL_fmeasure": 0.16392696498450032, "rougeL_fmeasure_stderr": 0.0019387441418856352, "rougeL_precision": 0.11750264664821597, "rougeL_precision_stderr": 0.001455192782942557, "rougeL_recall": 0.28553565213661397, "rougeL_recall_stderr": 0.003571545700265264, "rougeLsum_fmeasure": 0.1729737886084137, "rougeLsum_fmeasure_stderr": 0.0021734952347434375, "rougeLsum_precision": 0.12392072701085288, "rougeLsum_precision_stderr": 0.0016211314235083207, "rougeLsum_recall": 0.30146747797091933, "rougeLsum_recall_stderr": 0.003959575791577483}}, "1": {"article_DOC_summary": {"bleu": 1.9249412744961178, "bleu_stderr": 0.12178209771806298, "rouge1_fmeasure": 0.20067154816406912, "rouge1_fmeasure_stderr": 0.0026898597422456614, "rouge1_precision": 0.14293573080985697, "rouge1_precision_stderr": 0.0020020715732026848, "rouge1_recall": 0.3507450257481026, "rouge1_recall_stderr": 0.004639905013718798, "rouge2_fmeasure": 0.04598183981683866, "rouge2_fmeasure_stderr": 0.001649351483193518, "rouge2_precision": 0.0324308871516594, "rouge2_precision_stderr": 0.0011682005693517872, "rouge2_recall": 0.08264816662579652, "rouge2_recall_stderr": 0.003080246882800617, "rougeL_fmeasure": 0.15118274474995874, "rougeL_fmeasure_stderr": 0.001994655397896864, "rougeL_precision": 0.10746501641700831, "rougeL_precision_stderr": 0.0014694032015050793, "rougeL_recall": 0.2659219914279442, "rougeL_recall_stderr": 0.0035904663684219085, "rougeLsum_fmeasure": 0.160999424649348, "rougeLsum_fmeasure_stderr": 0.002240087025674008, "rougeLsum_precision": 0.11442553643678831, "rougeLsum_precision_stderr": 0.0016427072257106192, "rougeLsum_recall": 0.28320972201688277, "rougeLsum_recall_stderr": 0.00404579631504026}}, "2": {"article_DOC_summary": {"bleu": 2.354378541611583, "bleu_stderr": 0.11647407253425891, "rouge1_fmeasure": 0.22162196546785315, "rouge1_fmeasure_stderr": 0.0026821013380755065, "rouge1_precision": 0.15803061677869468, "rouge1_precision_stderr": 0.0020134830696427186, "rouge1_recall": 0.38606737885839715, "rouge1_recall_stderr": 0.0045727909552794035, "rouge2_fmeasure": 0.05795467773425469, "rouge2_fmeasure_stderr": 0.001726378828967632, "rouge2_precision": 0.04089910513966374, "rouge2_precision_stderr": 0.0012276135124666848, "rouge2_recall": 0.10377696105882783, "rouge2_recall_stderr": 0.0031669949816395633, "rougeL_fmeasure": 0.16710610962820924, "rougeL_fmeasure_stderr": 0.0020561292357138584, "rougeL_precision": 0.11895190107092266, "rougeL_precision_stderr": 0.001522240640457265, "rougeL_recall": 0.29275028181844964, "rougeL_recall_stderr": 0.0036724085051555028, "rougeLsum_fmeasure": 0.17719218883183147, "rougeLsum_fmeasure_stderr": 0.002316006750830736, "rougeLsum_precision": 0.12607152034389388, "rougeLsum_precision_stderr": 0.0017074031321102738, "rougeLsum_recall": 0.31057163498399204, "rougeLsum_recall_stderr": 0.0041042123336552}}}}
|
8b7178b44b/evaluation/rankeval/8b7178b44b_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.343,0.015019206922356953,0
|
3 |
+
anli_r2,acc,0.345,0.015039986742055233,0
|
4 |
+
anli_r3,acc,0.3525,0.013797164918918359,0
|
5 |
+
arc_challenge,acc,0.32081911262798635,0.01364094309194652,0
|
6 |
+
arc_challenge,acc_norm,0.3370307167235495,0.013813476652902267,0
|
7 |
+
arc_easy,acc,0.6675084175084175,0.009666892606130122,0
|
8 |
+
arc_easy,acc_norm,0.585016835016835,0.010110383151961139,0
|
9 |
+
boolq,acc,0.6311926605504588,0.008438656079759075,1
|
10 |
+
cb,acc,0.375,0.06527912098338669,1
|
11 |
+
cb,f1,0.1986111111111111,,1
|
12 |
+
copa,acc,0.81,0.03942772444036623,0
|
13 |
+
hellaswag,acc,0.5324636526588329,0.004979252954977319,0
|
14 |
+
hellaswag,acc_norm,0.7038438558056164,0.004556276293751941,0
|
15 |
+
piqa,acc,0.780195865070729,0.009661958616651764,0
|
16 |
+
piqa,acc_norm,0.7894450489662677,0.00951237808123874,0
|
17 |
+
rte,acc,0.5884476534296029,0.0296218322224172,0
|
18 |
+
sciq,acc,0.893,0.009779910359847169,0
|
19 |
+
sciq,acc_norm,0.826,0.011994493230973421,0
|
20 |
+
storycloze_2016,acc,0.7573490112239444,0.009913300265342056,0
|
21 |
+
winogrande,acc,0.6337805840568271,0.0135401443765889,0
|
8b7178b44b/evaluation/rankeval/8b7178b44b_0_lm-eval_global_step84877_2023-01-31-11-38-06_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.343,
|
5 |
-
"acc_stderr": 0.015019206922356953
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.345,
|
9 |
-
"acc_stderr": 0.015039986742055233
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3525,
|
13 |
-
"acc_stderr": 0.013797164918918359
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.375,
|
17 |
-
"acc_stderr": 0.06527912098338669,
|
18 |
-
"f1": 0.1986111111111111
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.81,
|
22 |
-
"acc_stderr": 0.03942772444036623
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5324636526588329,
|
26 |
-
"acc_stderr": 0.004979252954977319,
|
27 |
-
"acc_norm": 0.7038438558056164,
|
28 |
-
"acc_norm_stderr": 0.004556276293751941
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5884476534296029,
|
32 |
-
"acc_stderr": 0.0296218322224172
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6337805840568271,
|
36 |
-
"acc_stderr": 0.0135401443765889
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7573490112239444,
|
40 |
-
"acc_stderr": 0.009913300265342056
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6311926605504588,
|
44 |
-
"acc_stderr": 0.008438656079759075
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6675084175084175,
|
48 |
-
"acc_stderr": 0.009666892606130122,
|
49 |
-
"acc_norm": 0.585016835016835,
|
50 |
-
"acc_norm_stderr": 0.010110383151961139
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.32081911262798635,
|
54 |
-
"acc_stderr": 0.01364094309194652,
|
55 |
-
"acc_norm": 0.3370307167235495,
|
56 |
-
"acc_norm_stderr": 0.013813476652902267
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.893,
|
60 |
-
"acc_stderr": 0.009779910359847169,
|
61 |
-
"acc_norm": 0.826,
|
62 |
-
"acc_norm_stderr": 0.011994493230973421
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.780195865070729,
|
66 |
-
"acc_stderr": 0.009661958616651764,
|
67 |
-
"acc_norm": 0.7894450489662677,
|
68 |
-
"acc_norm_stderr": 0.00951237808123874
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b44b/evaluation/rankeval/8b7178b44b_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.33,0.014876872027456734,0
|
3 |
+
anli_r2,acc,0.314,0.014683991951087967,0
|
4 |
+
anli_r3,acc,0.3675,0.013923529685359282,0
|
5 |
+
arc_challenge,acc,0.3319112627986348,0.013760988200880536,0
|
6 |
+
arc_challenge,acc_norm,0.3515358361774744,0.013952413699600943,0
|
7 |
+
arc_easy,acc,0.6767676767676768,0.009597218642045324,0
|
8 |
+
arc_easy,acc_norm,0.6439393939393939,0.009825454608416304,0
|
9 |
+
boolq,acc,0.6406727828746177,0.00839181177040674,1
|
10 |
+
cb,acc,0.48214285714285715,0.0673769750864465,1
|
11 |
+
cb,f1,0.36000000000000004,,1
|
12 |
+
copa,acc,0.76,0.04292346959909283,0
|
13 |
+
hellaswag,acc,0.529874526986656,0.004980866814462756,0
|
14 |
+
hellaswag,acc_norm,0.7042421828321052,0.004554499409290722,0
|
15 |
+
piqa,acc,0.7823721436343852,0.009627407474840878,0
|
16 |
+
piqa,acc_norm,0.7861806311207835,0.009565994206915606,0
|
17 |
+
rte,acc,0.5379061371841155,0.030009848912529113,0
|
18 |
+
sciq,acc,0.919,0.008632121032139985,0
|
19 |
+
sciq,acc_norm,0.915,0.00882342636694232,0
|
20 |
+
storycloze_2016,acc,0.7514698022447889,0.009993659448666372,0
|
21 |
+
winogrande,acc,0.6385161799526441,0.013502479670791285,0
|
8b7178b44b/evaluation/rankeval/8b7178b44b_1_lm-eval_global_step84877_2023-01-31-11-38-06_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.33,
|
5 |
-
"acc_stderr": 0.014876872027456734
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.314,
|
9 |
-
"acc_stderr": 0.014683991951087967
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3675,
|
13 |
-
"acc_stderr": 0.013923529685359282
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.48214285714285715,
|
17 |
-
"acc_stderr": 0.0673769750864465,
|
18 |
-
"f1": 0.36000000000000004
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.76,
|
22 |
-
"acc_stderr": 0.04292346959909283
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.529874526986656,
|
26 |
-
"acc_stderr": 0.004980866814462756,
|
27 |
-
"acc_norm": 0.7042421828321052,
|
28 |
-
"acc_norm_stderr": 0.004554499409290722
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5379061371841155,
|
32 |
-
"acc_stderr": 0.030009848912529113
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6385161799526441,
|
36 |
-
"acc_stderr": 0.013502479670791285
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7514698022447889,
|
40 |
-
"acc_stderr": 0.009993659448666372
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6406727828746177,
|
44 |
-
"acc_stderr": 0.00839181177040674
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6767676767676768,
|
48 |
-
"acc_stderr": 0.009597218642045324,
|
49 |
-
"acc_norm": 0.6439393939393939,
|
50 |
-
"acc_norm_stderr": 0.009825454608416304
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3319112627986348,
|
54 |
-
"acc_stderr": 0.013760988200880536,
|
55 |
-
"acc_norm": 0.3515358361774744,
|
56 |
-
"acc_norm_stderr": 0.013952413699600943
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.919,
|
60 |
-
"acc_stderr": 0.008632121032139985,
|
61 |
-
"acc_norm": 0.915,
|
62 |
-
"acc_norm_stderr": 0.00882342636694232
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7823721436343852,
|
66 |
-
"acc_stderr": 0.009627407474840878,
|
67 |
-
"acc_norm": 0.7861806311207835,
|
68 |
-
"acc_norm_stderr": 0.009565994206915606
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b44b/evaluation/rankeval/8b7178b44b_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.32,0.01475865230357487,0
|
3 |
+
anli_r2,acc,0.335,0.014933117490932566,0
|
4 |
+
anli_r3,acc,0.33916666666666667,0.013672343491681812,0
|
5 |
+
arc_challenge,acc,0.3430034129692833,0.013872423223718173,0
|
6 |
+
arc_challenge,acc_norm,0.34982935153583616,0.013936809212158284,0
|
7 |
+
arc_easy,acc,0.672979797979798,0.009626235849372207,0
|
8 |
+
arc_easy,acc_norm,0.6553030303030303,0.009752321586569784,0
|
9 |
+
boolq,acc,0.6464831804281346,0.008361346005339394,1
|
10 |
+
cb,acc,0.375,0.06527912098338669,1
|
11 |
+
cb,f1,0.28595317725752506,,1
|
12 |
+
copa,acc,0.89,0.03144660377352203,0
|
13 |
+
hellaswag,acc,0.5285799641505676,0.004981623292196192,0
|
14 |
+
hellaswag,acc_norm,0.7057359091814379,0.00454779896412668,0
|
15 |
+
piqa,acc,0.7774755168661589,0.009704600975718245,0
|
16 |
+
piqa,acc_norm,0.7861806311207835,0.009565994206915606,0
|
17 |
+
rte,acc,0.5451263537906137,0.029973636495415255,0
|
18 |
+
sciq,acc,0.929,0.008125578442487923,0
|
19 |
+
sciq,acc_norm,0.923,0.008434580140240644,0
|
20 |
+
storycloze_2016,acc,0.757883484767504,0.009905870033193868,0
|
21 |
+
winogrande,acc,0.6527229676400947,0.013380909249751242,0
|
8b7178b44b/evaluation/rankeval/8b7178b44b_2_lm-eval_global_step84877_2023-01-31-11-38-06_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.32,
|
5 |
-
"acc_stderr": 0.01475865230357487
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.335,
|
9 |
-
"acc_stderr": 0.014933117490932566
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.33916666666666667,
|
13 |
-
"acc_stderr": 0.013672343491681812
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.375,
|
17 |
-
"acc_stderr": 0.06527912098338669,
|
18 |
-
"f1": 0.28595317725752506
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.89,
|
22 |
-
"acc_stderr": 0.03144660377352203
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5285799641505676,
|
26 |
-
"acc_stderr": 0.004981623292196192,
|
27 |
-
"acc_norm": 0.7057359091814379,
|
28 |
-
"acc_norm_stderr": 0.00454779896412668
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5451263537906137,
|
32 |
-
"acc_stderr": 0.029973636495415255
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6527229676400947,
|
36 |
-
"acc_stderr": 0.013380909249751242
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.757883484767504,
|
40 |
-
"acc_stderr": 0.009905870033193868
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6464831804281346,
|
44 |
-
"acc_stderr": 0.008361346005339394
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.672979797979798,
|
48 |
-
"acc_stderr": 0.009626235849372207,
|
49 |
-
"acc_norm": 0.6553030303030303,
|
50 |
-
"acc_norm_stderr": 0.009752321586569784
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3430034129692833,
|
54 |
-
"acc_stderr": 0.013872423223718173,
|
55 |
-
"acc_norm": 0.34982935153583616,
|
56 |
-
"acc_norm_stderr": 0.013936809212158284
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.929,
|
60 |
-
"acc_stderr": 0.008125578442487923,
|
61 |
-
"acc_norm": 0.923,
|
62 |
-
"acc_norm_stderr": 0.008434580140240644
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7774755168661589,
|
66 |
-
"acc_stderr": 0.009704600975718245,
|
67 |
-
"acc_norm": 0.7861806311207835,
|
68 |
-
"acc_norm_stderr": 0.009565994206915606
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b44b/evaluation/rankeval/8b7178b44b_3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.349,0.015080663991563098,0
|
3 |
+
anli_r2,acc,0.345,0.015039986742055242,0
|
4 |
+
anli_r3,acc,0.3466666666666667,0.013744022550571956,0
|
5 |
+
arc_challenge,acc,0.3395904436860068,0.01383903976282016,0
|
6 |
+
arc_challenge,acc_norm,0.3660409556313993,0.014077223108470142,0
|
7 |
+
arc_easy,acc,0.6839225589225589,0.009540440071928283,0
|
8 |
+
arc_easy,acc_norm,0.6683501683501684,0.009660733780923948,0
|
9 |
+
boolq,acc,0.6590214067278287,0.00829097981816109,1
|
10 |
+
cb,acc,0.48214285714285715,0.0673769750864465,1
|
11 |
+
cb,f1,0.43206548866926225,,1
|
12 |
+
copa,acc,0.84,0.03684529491774709,0
|
13 |
+
hellaswag,acc,0.5317665803624776,0.004979700695747948,0
|
14 |
+
hellaswag,acc_norm,0.7076279625572595,0.004539227260397019,0
|
15 |
+
piqa,acc,0.7905331882480957,0.009494302979819794,0
|
16 |
+
piqa,acc_norm,0.7927094668117519,0.009457844699952372,0
|
17 |
+
rte,acc,0.51985559566787,0.030072723167317177,0
|
18 |
+
sciq,acc,0.935,0.007799733061832017,0
|
19 |
+
sciq,acc_norm,0.929,0.008125578442487916,0
|
20 |
+
storycloze_2016,acc,0.7573490112239444,0.009913300265342056,0
|
21 |
+
winogrande,acc,0.6432517758484609,0.013463393958028726,0
|
8b7178b44b/evaluation/rankeval/8b7178b44b_3_lm-eval_global_step84877_2023-01-31-11-38-06_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.349,
|
5 |
-
"acc_stderr": 0.015080663991563098
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.345,
|
9 |
-
"acc_stderr": 0.015039986742055242
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3466666666666667,
|
13 |
-
"acc_stderr": 0.013744022550571956
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.48214285714285715,
|
17 |
-
"acc_stderr": 0.0673769750864465,
|
18 |
-
"f1": 0.43206548866926225
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.84,
|
22 |
-
"acc_stderr": 0.03684529491774709
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5317665803624776,
|
26 |
-
"acc_stderr": 0.004979700695747948,
|
27 |
-
"acc_norm": 0.7076279625572595,
|
28 |
-
"acc_norm_stderr": 0.004539227260397019
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.51985559566787,
|
32 |
-
"acc_stderr": 0.030072723167317177
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6432517758484609,
|
36 |
-
"acc_stderr": 0.013463393958028726
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7573490112239444,
|
40 |
-
"acc_stderr": 0.009913300265342056
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6590214067278287,
|
44 |
-
"acc_stderr": 0.00829097981816109
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6839225589225589,
|
48 |
-
"acc_stderr": 0.009540440071928283,
|
49 |
-
"acc_norm": 0.6683501683501684,
|
50 |
-
"acc_norm_stderr": 0.009660733780923948
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.3395904436860068,
|
54 |
-
"acc_stderr": 0.01383903976282016,
|
55 |
-
"acc_norm": 0.3660409556313993,
|
56 |
-
"acc_norm_stderr": 0.014077223108470142
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.935,
|
60 |
-
"acc_stderr": 0.007799733061832017,
|
61 |
-
"acc_norm": 0.929,
|
62 |
-
"acc_norm_stderr": 0.008125578442487916
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7905331882480957,
|
66 |
-
"acc_stderr": 0.009494302979819794,
|
67 |
-
"acc_norm": 0.7927094668117519,
|
68 |
-
"acc_norm_stderr": 0.009457844699952372
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b44b/evaluation/rankeval/8b7178b44b_4.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.352,0.015110404505648664,0
|
3 |
+
anli_r2,acc,0.354,0.015129868238451773,0
|
4 |
+
anli_r3,acc,0.3433333333333333,0.01371263383046586,0
|
5 |
+
arc_challenge,acc,0.35665529010238906,0.013998056902620199,0
|
6 |
+
arc_challenge,acc_norm,0.3677474402730375,0.014090995618168468,0
|
7 |
+
arc_easy,acc,0.6893939393939394,0.009495260551195608,0
|
8 |
+
arc_easy,acc_norm,0.6750841750841751,0.00961020360450482,0
|
9 |
+
boolq,acc,0.6614678899082569,0.008276502626477437,1
|
10 |
+
cb,acc,0.5,0.06741998624632421,1
|
11 |
+
cb,f1,0.3770850423844681,,1
|
12 |
+
copa,acc,0.84,0.0368452949177471,0
|
13 |
+
hellaswag,acc,0.530372435769767,0.004980566907790448,0
|
14 |
+
hellaswag,acc_norm,0.7117108145787692,0.00452040633108404,0
|
15 |
+
piqa,acc,0.7872687704026116,0.00954822312304734,0
|
16 |
+
piqa,acc_norm,0.7889009793253536,0.00952137737873415,0
|
17 |
+
rte,acc,0.5090252707581228,0.030091559826331334,0
|
18 |
+
sciq,acc,0.938,0.007629823996280308,0
|
19 |
+
sciq,acc_norm,0.928,0.008178195576218681,0
|
20 |
+
storycloze_2016,acc,0.7669695350080171,0.009776301898548037,0
|
21 |
+
winogrande,acc,0.6511444356748224,0.013395059320137327,0
|
8b7178b44b/evaluation/rankeval/8b7178b44b_4_lm-eval_global_step84877_2023-01-31-11-38-06_4shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.352,
|
5 |
-
"acc_stderr": 0.015110404505648664
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.354,
|
9 |
-
"acc_stderr": 0.015129868238451773
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3433333333333333,
|
13 |
-
"acc_stderr": 0.01371263383046586
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5,
|
17 |
-
"acc_stderr": 0.06741998624632421,
|
18 |
-
"f1": 0.3770850423844681
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.84,
|
22 |
-
"acc_stderr": 0.0368452949177471
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.530372435769767,
|
26 |
-
"acc_stderr": 0.004980566907790448,
|
27 |
-
"acc_norm": 0.7117108145787692,
|
28 |
-
"acc_norm_stderr": 0.00452040633108404
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5090252707581228,
|
32 |
-
"acc_stderr": 0.030091559826331334
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6511444356748224,
|
36 |
-
"acc_stderr": 0.013395059320137327
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7669695350080171,
|
40 |
-
"acc_stderr": 0.009776301898548037
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6614678899082569,
|
44 |
-
"acc_stderr": 0.008276502626477437
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6893939393939394,
|
48 |
-
"acc_stderr": 0.009495260551195608,
|
49 |
-
"acc_norm": 0.6750841750841751,
|
50 |
-
"acc_norm_stderr": 0.00961020360450482
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.35665529010238906,
|
54 |
-
"acc_stderr": 0.013998056902620199,
|
55 |
-
"acc_norm": 0.3677474402730375,
|
56 |
-
"acc_norm_stderr": 0.014090995618168468
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.938,
|
60 |
-
"acc_stderr": 0.007629823996280308,
|
61 |
-
"acc_norm": 0.928,
|
62 |
-
"acc_norm_stderr": 0.008178195576218681
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7872687704026116,
|
66 |
-
"acc_stderr": 0.00954822312304734,
|
67 |
-
"acc_norm": 0.7889009793253536,
|
68 |
-
"acc_norm_stderr": 0.00952137737873415
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b44b/evaluation/rankeval/8b7178b44b_5.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.332,0.014899597242811478,0
|
3 |
+
anli_r2,acc,0.315,0.014696631960792492,0
|
4 |
+
anli_r3,acc,0.3233333333333333,0.013508372867300212,0
|
5 |
+
arc_challenge,acc,0.35921501706484643,0.014020224155839141,0
|
6 |
+
arc_challenge,acc_norm,0.3651877133105802,0.014070265519268804,0
|
7 |
+
arc_easy,acc,0.6902356902356902,0.00948817285190372,0
|
8 |
+
arc_easy,acc_norm,0.6734006734006734,0.009623047038267657,0
|
9 |
+
boolq,acc,0.6688073394495413,0.008231583858517822,1
|
10 |
+
cb,acc,0.5,0.06741998624632421,1
|
11 |
+
cb,f1,0.36179337231968806,,1
|
12 |
+
copa,acc,0.85,0.035887028128263734,0
|
13 |
+
hellaswag,acc,0.5313682533359888,0.004979952166595539,0
|
14 |
+
hellaswag,acc_norm,0.7123083051185023,0.004517614647703246,0
|
15 |
+
piqa,acc,0.7763873775843307,0.009721489519176297,0
|
16 |
+
piqa,acc_norm,0.7910772578890098,0.009485227030105093,0
|
17 |
+
rte,acc,0.5595667870036101,0.029882123363118712,0
|
18 |
+
sciq,acc,0.937,0.007687007876286423,0
|
19 |
+
sciq,acc_norm,0.931,0.008018934050315158,0
|
20 |
+
storycloze_2016,acc,0.7530732228754676,0.00997199136038898,0
|
21 |
+
winogrande,acc,0.6495659037095501,0.013409047676670192,0
|
8b7178b44b/evaluation/rankeval/8b7178b44b_5.json
CHANGED
@@ -48,6 +48,24 @@
|
|
48 |
"acc_stderr": 0.00948817285190372,
|
49 |
"acc_norm": 0.6734006734006734,
|
50 |
"acc_norm_stderr": 0.009623047038267657
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
}
|
52 |
},
|
53 |
"versions": {
|
@@ -61,6 +79,9 @@
|
|
61 |
"winogrande": 0,
|
62 |
"storycloze_2016": 0,
|
63 |
"boolq": 1,
|
64 |
-
"arc_easy": 0
|
|
|
|
|
|
|
65 |
}
|
66 |
}
|
|
|
48 |
"acc_stderr": 0.00948817285190372,
|
49 |
"acc_norm": 0.6734006734006734,
|
50 |
"acc_norm_stderr": 0.009623047038267657
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.35921501706484643,
|
54 |
+
"acc_stderr": 0.014020224155839141,
|
55 |
+
"acc_norm": 0.3651877133105802,
|
56 |
+
"acc_norm_stderr": 0.014070265519268804
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.937,
|
60 |
+
"acc_stderr": 0.007687007876286423,
|
61 |
+
"acc_norm": 0.931,
|
62 |
+
"acc_norm_stderr": 0.008018934050315158
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7763873775843307,
|
66 |
+
"acc_stderr": 0.009721489519176297,
|
67 |
+
"acc_norm": 0.7910772578890098,
|
68 |
+
"acc_norm_stderr": 0.009485227030105093
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
79 |
"winogrande": 0,
|
80 |
"storycloze_2016": 0,
|
81 |
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
8b7178b44b/evaluation/rankeval/8b7178b44b_5_lm-eval_global_step84877_2023-01-31-11-38-06_5shots_backup.json
DELETED
@@ -1,66 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.332,
|
5 |
-
"acc_stderr": 0.014899597242811478
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.315,
|
9 |
-
"acc_stderr": 0.014696631960792492
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3233333333333333,
|
13 |
-
"acc_stderr": 0.013508372867300212
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5,
|
17 |
-
"acc_stderr": 0.06741998624632421,
|
18 |
-
"f1": 0.36179337231968806
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.85,
|
22 |
-
"acc_stderr": 0.035887028128263734
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5313682533359888,
|
26 |
-
"acc_stderr": 0.004979952166595539,
|
27 |
-
"acc_norm": 0.7123083051185023,
|
28 |
-
"acc_norm_stderr": 0.004517614647703246
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5595667870036101,
|
32 |
-
"acc_stderr": 0.029882123363118712
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6495659037095501,
|
36 |
-
"acc_stderr": 0.013409047676670192
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7530732228754676,
|
40 |
-
"acc_stderr": 0.00997199136038898
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6688073394495413,
|
44 |
-
"acc_stderr": 0.008231583858517822
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6902356902356902,
|
48 |
-
"acc_stderr": 0.00948817285190372,
|
49 |
-
"acc_norm": 0.6734006734006734,
|
50 |
-
"acc_norm_stderr": 0.009623047038267657
|
51 |
-
}
|
52 |
-
},
|
53 |
-
"versions": {
|
54 |
-
"anli_r1": 0,
|
55 |
-
"anli_r2": 0,
|
56 |
-
"anli_r3": 0,
|
57 |
-
"cb": 1,
|
58 |
-
"copa": 0,
|
59 |
-
"hellaswag": 0,
|
60 |
-
"rte": 0,
|
61 |
-
"winogrande": 0,
|
62 |
-
"storycloze_2016": 0,
|
63 |
-
"boolq": 1,
|
64 |
-
"arc_easy": 0
|
65 |
-
}
|
66 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b58b/evaluation/generation/merged.csv
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.009544918858239404
|
3 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.009544918858239404
|
4 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.19809860044914396
|
5 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.19809860044914396
|
6 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.22321575010543798
|
7 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.22321575010543798
|
8 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2289130980630063
|
9 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2289130980630063
|
10 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2331128191296418
|
11 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2331128191296418
|
12 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.22941163204827142
|
13 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.22941163204827142
|
14 |
+
e2e_nlg_cleaned,5,average,multiple,0.18704946977562348
|
15 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.05581311382540214
|
16 |
+
gem_xsum,0,median,rouge2_fmeasure,0.05581311382540214
|
17 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04842756093378267
|
18 |
+
gem_xsum,1,median,rouge2_fmeasure,0.04842756093378267
|
19 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05372147678733591
|
20 |
+
gem_xsum,2,median,rouge2_fmeasure,0.05372147678733591
|
21 |
+
gem_xsum,2,average,multiple,0.05265405051550691
|
22 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05525608727734654
|
23 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.05525608727734654
|
24 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.059610216940881165
|
25 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.059610216940881165
|
26 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.059094553406324295
|
27 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.059094553406324295
|
28 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05861591086080908
|
29 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.05861591086080908
|
30 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05896624699685776
|
31 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.05896624699685776
|
32 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.059676320234352584
|
33 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.059676320234352584
|
34 |
+
web_nlg_en,5,average,multiple,0.0585365559527619
|
35 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.037350220833466265
|
36 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.037350220833466265
|
37 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.06579562483959114
|
38 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.06579562483959114
|
39 |
+
wiki_lingua_en,1,average,multiple,0.0515729228365287
|
8b7178b58b/evaluation/generation/merged.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3677810394757023, "bleu_stderr": 0.03281723526619896, "rouge1_fmeasure": 0.11606407601489055, "rouge1_fmeasure_stderr": 0.001969146758246114, "rouge1_precision": 0.07551756953809975, "rouge1_precision_stderr": 0.0014428401024288566, "rouge1_recall": 0.32667793832637043, "rouge1_recall_stderr": 0.0045432759692835755, "rouge2_fmeasure": 0.05525608727734654, "rouge2_fmeasure_stderr": 0.0012506172623425777, "rouge2_precision": 0.0358576440999003, "rouge2_precision_stderr": 0.0009026300665133467, "rouge2_recall": 0.16122516666992529, "rouge2_recall_stderr": 0.003267212716096976, "rougeL_fmeasure": 0.11192452656660684, "rougeL_fmeasure_stderr": 0.00184821865839915, "rougeL_precision": 0.07265841861456275, "rougeL_precision_stderr": 0.0013478808767448548, "rougeL_recall": 0.3173688843344159, "rougeL_recall_stderr": 0.004415574211500117, "rougeLsum_fmeasure": 0.11130422275494882, "rougeLsum_fmeasure_stderr": 0.0018686028478286372, "rougeLsum_precision": 0.07243892580671336, "rougeLsum_precision_stderr": 0.0013760467900819921, "rougeLsum_recall": 0.3134569235593474, "rougeLsum_recall_stderr": 0.004287283373230945}}, "1": {"PALM_prompt": {"bleu": 0.5497938903834476, "bleu_stderr": 0.030329648907717416, "rouge1_fmeasure": 0.12377098272865582, "rouge1_fmeasure_stderr": 0.0018005918585052136, "rouge1_precision": 0.07921341676188258, "rouge1_precision_stderr": 0.0013409822377619385, "rouge1_recall": 0.39870747987203026, "rouge1_recall_stderr": 0.005183434021587761, "rouge2_fmeasure": 0.059610216940881165, "rouge2_fmeasure_stderr": 0.0011877157461269233, "rouge2_precision": 0.037977631085495546, "rouge2_precision_stderr": 0.0008537095990628626, "rouge2_recall": 0.20482620328537757, "rouge2_recall_stderr": 0.003925783970259362, "rougeL_fmeasure": 0.11717642785021219, "rougeL_fmeasure_stderr": 0.001674344616420258, "rougeL_precision": 0.07494409148525415, "rougeL_precision_stderr": 0.0012339020353945417, "rougeL_recall": 0.3749296897875364, "rougeL_recall_stderr": 0.004746737866261382, "rougeLsum_fmeasure": 0.1180104667449292, "rougeLsum_fmeasure_stderr": 0.0017037357470238706, "rougeLsum_precision": 0.07561615649375214, "rougeLsum_precision_stderr": 0.001274630336916664, "rougeLsum_recall": 0.3781401543982665, "rougeLsum_recall_stderr": 0.004772450320283416}}, "2": {"PALM_prompt": {"bleu": 0.6345753558723145, "bleu_stderr": 0.022376664491002615, "rouge1_fmeasure": 0.12360575844131859, "rouge1_fmeasure_stderr": 0.0016749434032436248, "rouge1_precision": 0.07795669324486854, "rouge1_precision_stderr": 0.00119684640928671, "rouge1_recall": 0.41803228663523345, "rouge1_recall_stderr": 0.0053864905798921125, "rouge2_fmeasure": 0.059094553406324295, "rouge2_fmeasure_stderr": 0.0010986707292730262, "rouge2_precision": 0.03697168943889819, "rouge2_precision_stderr": 0.0007524504443837138, "rouge2_recall": 0.21717607459640267, "rouge2_recall_stderr": 0.004101036268432045, "rougeL_fmeasure": 0.1151080347627734, "rougeL_fmeasure_stderr": 0.0015232281204089519, "rougeL_precision": 0.07270569406411607, "rougeL_precision_stderr": 0.0010983482762338668, "rougeL_recall": 0.38609502456284567, "rougeL_recall_stderr": 0.004797386037833962, "rougeLsum_fmeasure": 0.11783674396217436, "rougeLsum_fmeasure_stderr": 0.0015879949279193565, "rougeLsum_precision": 0.07442415306126304, "rougeLsum_precision_stderr": 0.0011448850348819214, "rougeLsum_recall": 0.396883636105679, "rougeLsum_recall_stderr": 0.0049884922683259055}}, "3": {"PALM_prompt": {"bleu": 0.6611202603466564, "bleu_stderr": 0.026292760196011896, "rouge1_fmeasure": 0.12270682587650009, "rouge1_fmeasure_stderr": 0.001689827532535868, "rouge1_precision": 0.07720993388700956, "rouge1_precision_stderr": 0.0012101497257156794, "rouge1_recall": 0.4133639192496906, "rouge1_recall_stderr": 0.005411628648726416, "rouge2_fmeasure": 0.05861591086080908, "rouge2_fmeasure_stderr": 0.0010907460831452867, "rouge2_precision": 0.03655832324841317, "rouge2_precision_stderr": 0.0007506453443122757, "rouge2_recall": 0.21585685095591742, "rouge2_recall_stderr": 0.004093725567157412, "rougeL_fmeasure": 0.11361332097070079, "rougeL_fmeasure_stderr": 0.0015022331310385422, "rougeL_precision": 0.07151482183987531, "rougeL_precision_stderr": 0.001086199341044554, "rougeL_recall": 0.3823649104455851, "rougeL_recall_stderr": 0.004813722726129101, "rougeLsum_fmeasure": 0.11674347677370367, "rougeLsum_fmeasure_stderr": 0.0015911049077229342, "rougeLsum_precision": 0.07350481492884033, "rougeLsum_precision_stderr": 0.0011485603851221294, "rougeLsum_recall": 0.39303289732256513, "rougeLsum_recall_stderr": 0.005032921309423209}}, "4": {"PALM_prompt": {"bleu": 0.7128978831232388, "bleu_stderr": 0.03892571091725707, "rouge1_fmeasure": 0.12369444007480314, "rouge1_fmeasure_stderr": 0.0016461113907076013, "rouge1_precision": 0.07777752463393256, "rouge1_precision_stderr": 0.0011834566593491008, "rouge1_recall": 0.4222346728535934, "rouge1_recall_stderr": 0.005371874334785877, "rouge2_fmeasure": 0.05896624699685776, "rouge2_fmeasure_stderr": 0.0010852712903198752, "rouge2_precision": 0.03670980373588488, "rouge2_precision_stderr": 0.0007435896517793837, "rouge2_recall": 0.2212321108756897, "rouge2_recall_stderr": 0.004139207390440789, "rougeL_fmeasure": 0.11384905074014122, "rougeL_fmeasure_stderr": 0.00146289053656463, "rougeL_precision": 0.07163006841296378, "rougeL_precision_stderr": 0.001061488881603934, "rougeL_recall": 0.3878434903814413, "rougeL_recall_stderr": 0.004773394354686469, "rougeLsum_fmeasure": 0.11795118372977331, "rougeLsum_fmeasure_stderr": 0.001557940253935983, "rougeLsum_precision": 0.07421506767947293, "rougeLsum_precision_stderr": 0.001124685544226384, "rougeLsum_recall": 0.40133513783035873, "rougeLsum_recall_stderr": 0.005016786537745759}}, "5": {"PALM_prompt": {"bleu": 0.7296500968020738, "bleu_stderr": 0.03986679058747073, "rouge1_fmeasure": 0.12440202385333685, "rouge1_fmeasure_stderr": 0.0016439820305072118, "rouge1_precision": 0.07793578273286862, "rouge1_precision_stderr": 0.001170086916396576, "rouge1_recall": 0.4293926231707037, "rouge1_recall_stderr": 0.005544216357063792, "rouge2_fmeasure": 0.059676320234352584, "rouge2_fmeasure_stderr": 0.0010914957404078694, "rouge2_precision": 0.03706679304975263, "rouge2_precision_stderr": 0.0007461793226765045, "rouge2_recall": 0.22614254310251874, "rouge2_recall_stderr": 0.004258603721661164, "rougeL_fmeasure": 0.11426293462896298, "rougeL_fmeasure_stderr": 0.001470255169191014, "rougeL_precision": 0.07171154548923617, "rougeL_precision_stderr": 0.001063717283795246, "rougeL_recall": 0.39264622495319934, "rougeL_recall_stderr": 0.004853098099750933, "rougeLsum_fmeasure": 0.11846322387156706, "rougeLsum_fmeasure_stderr": 0.0015476540324461025, "rougeLsum_precision": 0.07430376688251292, "rougeLsum_precision_stderr": 0.0011115454477617379, "rougeLsum_recall": 0.40784183583085276, "rougeLsum_recall_stderr": 0.005148423771775361}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.6588537886073913, "bleu_stderr": 0.0443627487585664, "rouge1_fmeasure": 0.1771116987872097, "rouge1_fmeasure_stderr": 0.0019001225147585595, "rouge1_precision": 0.15071008120560497, "rouge1_precision_stderr": 0.0019169944124831117, "rouge1_recall": 0.2615215631789747, "rouge1_recall_stderr": 0.0028587647553713004, "rouge2_fmeasure": 0.037350220833466265, "rouge2_fmeasure_stderr": 0.0009013013631899656, "rouge2_precision": 0.03154759752710423, "rouge2_precision_stderr": 0.0007938342962238314, "rouge2_recall": 0.05718093036037643, "rouge2_recall_stderr": 0.0015126905612736802, "rougeL_fmeasure": 0.1360411096046868, "rougeL_fmeasure_stderr": 0.0013491547776702593, "rougeL_precision": 0.11439573579669693, "rougeL_precision_stderr": 0.0013313396769038728, "rougeL_recall": 0.2058353141373366, "rougeL_recall_stderr": 0.0022855149086852887, "rougeLsum_fmeasure": 0.16367916861611936, "rougeLsum_fmeasure_stderr": 0.0017419145199993307, "rougeLsum_precision": 0.13909515224541577, "rougeLsum_precision_stderr": 0.0017556231845242017, "rougeLsum_recall": 0.24244042499581767, "rougeLsum_recall_stderr": 0.0026519653366143562}}, "1": {"tldr_en": {"bleu": 3.5656470388016652, "bleu_stderr": 0.08197311523066556, "rouge1_fmeasure": 0.24342094967396746, "rouge1_fmeasure_stderr": 0.002061008293601447, "rouge1_precision": 0.2174735691686053, "rouge1_precision_stderr": 0.0024727115624900444, "rouge1_recall": 0.3460178647953164, "rouge1_recall_stderr": 0.0029105441665587413, "rouge2_fmeasure": 0.06579562483959114, "rouge2_fmeasure_stderr": 0.0012229149405673168, "rouge2_precision": 0.05974334784565509, "rouge2_precision_stderr": 0.0013520311423251665, "rouge2_recall": 0.09579105477293766, "rouge2_recall_stderr": 0.0019333447701627138, "rougeL_fmeasure": 0.1715894965219696, "rougeL_fmeasure_stderr": 0.0014647407258985053, "rougeL_precision": 0.15264641094715217, "rougeL_precision_stderr": 0.0017989249509303383, "rougeL_recall": 0.24988397509922583, "rougeL_recall_stderr": 0.0023954858097757943, "rougeLsum_fmeasure": 0.2293964977081046, "rougeLsum_fmeasure_stderr": 0.0019478558777254497, "rougeLsum_precision": 0.20485722652733274, "rougeLsum_precision_stderr": 0.002345428003149735, "rougeLsum_recall": 0.32704916730419714, "rougeLsum_recall_stderr": 0.0027965860033602368}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.0829790820445851, "bleu_stderr": 0.02197797601631595, "rouge1_fmeasure": 0.10446518863159925, "rouge1_fmeasure_stderr": 0.000777744935734902, "rouge1_precision": 0.07862879559521596, "rouge1_precision_stderr": 0.0006642205609450126, "rouge1_recall": 0.16475897375079807, "rouge1_recall_stderr": 0.0011383196654042192, "rouge2_fmeasure": 0.009544918858239404, "rouge2_fmeasure_stderr": 0.00027235599690889593, "rouge2_precision": 0.007369812030601589, "rouge2_precision_stderr": 0.00021896094396311517, "rouge2_recall": 0.014262052938121144, "rouge2_recall_stderr": 0.0004147127585356459, "rougeL_fmeasure": 0.1010125389757392, "rougeL_fmeasure_stderr": 0.0007159378810539217, "rougeL_precision": 0.07591871687158004, "rougeL_precision_stderr": 0.000610191190689566, "rougeL_recall": 0.1597865992371287, "rougeL_recall_stderr": 0.0010790075675615354, "rougeLsum_fmeasure": 0.08956898548608715, "rougeLsum_fmeasure_stderr": 0.0006418100960192206, "rougeLsum_precision": 0.0673047657702021, "rougeLsum_precision_stderr": 0.0005522793921438968, "rougeLsum_recall": 0.14195098898955258, "rougeLsum_recall_stderr": 0.0009730497278821901}}, "1": {"generate_text_restaurant": {"bleu": 10.908919859430615, "bleu_stderr": 0.16637332277469416, "rouge1_fmeasure": 0.43946652639475914, "rouge1_fmeasure_stderr": 0.0020459488608388305, "rouge1_precision": 0.4456385087012675, "rouge1_precision_stderr": 0.002353995044749726, "rouge1_recall": 0.47020454378467724, "rouge1_recall_stderr": 0.002936861869075764, "rouge2_fmeasure": 0.19809860044914396, "rouge2_fmeasure_stderr": 0.0017761118795827473, "rouge2_precision": 0.20055830506980424, "rouge2_precision_stderr": 0.0018894709575218048, "rouge2_recall": 0.2135715698673984, "rouge2_recall_stderr": 0.002164535077464, "rougeL_fmeasure": 0.31158605078339746, "rougeL_fmeasure_stderr": 0.0017722355642195858, "rougeL_precision": 0.31683119553670347, "rougeL_precision_stderr": 0.00202952794744226, "rougeL_recall": 0.33327675710241655, "rougeL_recall_stderr": 0.0023834364279679216, "rougeLsum_fmeasure": 0.36402615233989766, "rougeLsum_fmeasure_stderr": 0.0020471793436135748, "rougeLsum_precision": 0.36992976009148365, "rougeLsum_precision_stderr": 0.0023186341348778463, "rougeLsum_recall": 0.38880934305270765, "rougeLsum_recall_stderr": 0.0027097894189422028}}, "2": {"generate_text_restaurant": {"bleu": 12.496024480959404, "bleu_stderr": 0.17088930912135533, "rouge1_fmeasure": 0.46766982775713095, "rouge1_fmeasure_stderr": 0.001918126464011959, "rouge1_precision": 0.46513050554651497, "rouge1_precision_stderr": 0.002223586404391749, "rouge1_recall": 0.5044545963797141, "rouge1_recall_stderr": 0.0028151136740913597, "rouge2_fmeasure": 0.22321575010543798, "rouge2_fmeasure_stderr": 0.0018224289381751444, "rouge2_precision": 0.22174324642559978, "rouge2_precision_stderr": 0.0019100067340740226, "rouge2_recall": 0.24291920242568943, "rouge2_recall_stderr": 0.0022611578365930884, "rougeL_fmeasure": 0.3381497040905446, "rougeL_fmeasure_stderr": 0.0017546749447990841, "rougeL_precision": 0.3367051458657208, "rougeL_precision_stderr": 0.001973018443090721, "rougeL_recall": 0.36493130556649644, "rougeL_recall_stderr": 0.002393148715464209, "rougeLsum_fmeasure": 0.3916198529515355, "rougeLsum_fmeasure_stderr": 0.0020225802259552245, "rougeLsum_precision": 0.3898243742937925, "rougeLsum_precision_stderr": 0.002253177427026344, "rougeLsum_recall": 0.421949500869911, "rougeLsum_recall_stderr": 0.002689886396084085}}, "3": {"generate_text_restaurant": {"bleu": 12.876624238007336, "bleu_stderr": 0.13633367748001868, "rouge1_fmeasure": 0.47400023722312934, "rouge1_fmeasure_stderr": 0.001900202291806821, "rouge1_precision": 0.4666088727251901, "rouge1_precision_stderr": 0.0022313332553631853, "rouge1_recall": 0.5154695171919821, "rouge1_recall_stderr": 0.0027754047400933637, "rouge2_fmeasure": 0.2289130980630063, "rouge2_fmeasure_stderr": 0.0018363466703835796, "rouge2_precision": 0.2243865656346707, "rouge2_precision_stderr": 0.0018719494043509864, "rouge2_recall": 0.25156840321883794, "rouge2_recall_stderr": 0.0023063040992685763, "rougeL_fmeasure": 0.34349469052543125, "rougeL_fmeasure_stderr": 0.0017677223953016846, "rougeL_precision": 0.3380898171473523, "rougeL_precision_stderr": 0.0019534178000102315, "rougeL_recall": 0.37434995371411073, "rougeL_recall_stderr": 0.002444470010384392, "rougeLsum_fmeasure": 0.3977755285897407, "rougeLsum_fmeasure_stderr": 0.0020093339319118943, "rougeLsum_precision": 0.39156150543550683, "rougeLsum_precision_stderr": 0.0022264717816469198, "rougeLsum_recall": 0.43267538620734924, "rougeLsum_recall_stderr": 0.0027083330913664685}}, "4": {"generate_text_restaurant": {"bleu": 13.038419795082826, "bleu_stderr": 0.1976428408924392, "rouge1_fmeasure": 0.47584611404695387, "rouge1_fmeasure_stderr": 0.001947074296264647, "rouge1_precision": 0.466899050531228, "rouge1_precision_stderr": 0.0022803800444716094, "rouge1_recall": 0.5180929741571967, "rouge1_recall_stderr": 0.0027666521523327485, "rouge2_fmeasure": 0.2331128191296418, "rouge2_fmeasure_stderr": 0.0018880871361337953, "rouge2_precision": 0.22797984329863402, "rouge2_precision_stderr": 0.0019493427856802211, "rouge2_recall": 0.2562999197032082, "rouge2_recall_stderr": 0.002335812648509201, "rougeL_fmeasure": 0.34478040188814424, "rougeL_fmeasure_stderr": 0.0018168580160894932, "rougeL_precision": 0.33801665068739845, "rougeL_precision_stderr": 0.0019999447186985647, "rougeL_recall": 0.37648151811835306, "rougeL_recall_stderr": 0.0024685261116801542, "rougeLsum_fmeasure": 0.40002040198984967, "rougeLsum_fmeasure_stderr": 0.00209065471547177, "rougeLsum_precision": 0.3924955108326523, "rougeLsum_precision_stderr": 0.0023101764008543304, "rougeLsum_recall": 0.4357423190744161, "rougeLsum_recall_stderr": 0.0027563832782966625}}, "5": {"generate_text_restaurant": {"bleu": 12.732763368074703, "bleu_stderr": 0.17590369930486258, "rouge1_fmeasure": 0.47424433103008173, "rouge1_fmeasure_stderr": 0.0019121232239086051, "rouge1_precision": 0.46421051231524674, "rouge1_precision_stderr": 0.002228291979514538, "rouge1_recall": 0.5154379150999142, "rouge1_recall_stderr": 0.0026933286526690937, "rouge2_fmeasure": 0.22941163204827142, "rouge2_fmeasure_stderr": 0.001863304249033461, "rouge2_precision": 0.2240248553808123, "rouge2_precision_stderr": 0.001895223667089071, "rouge2_recall": 0.2513696885147789, "rouge2_recall_stderr": 0.0022849010356271104, "rougeL_fmeasure": 0.34311455561819293, "rougeL_fmeasure_stderr": 0.001783681336394135, "rougeL_precision": 0.3353528060064174, "rougeL_precision_stderr": 0.0019239210092200653, "rougeL_recall": 0.37390031423496406, "rougeL_recall_stderr": 0.0023965157476151687, "rougeLsum_fmeasure": 0.3973658692506892, "rougeLsum_fmeasure_stderr": 0.0020284980495066177, "rougeLsum_precision": 0.38889217013284716, "rougeLsum_precision_stderr": 0.0022276201914498226, "rougeLsum_recall": 0.43194157222453305, "rougeLsum_recall_stderr": 0.002653041029715851}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.379481370668616, "bleu_stderr": 0.06717438559240217, "rouge1_fmeasure": 0.22307907562150814, "rouge1_fmeasure_stderr": 0.002574919259740255, "rouge1_precision": 0.1641976469365769, "rouge1_precision_stderr": 0.0020666411427313226, "rouge1_recall": 0.3769795179589196, "rouge1_recall_stderr": 0.004557040772745154, "rouge2_fmeasure": 0.05581311382540214, "rouge2_fmeasure_stderr": 0.0017319768283373825, "rouge2_precision": 0.040059412047117826, "rouge2_precision_stderr": 0.0012577854209926665, "rouge2_recall": 0.09876412540351458, "rouge2_recall_stderr": 0.0031994931770752293, "rougeL_fmeasure": 0.16469766371025393, "rougeL_fmeasure_stderr": 0.0019416684869235467, "rougeL_precision": 0.12066732888153141, "rougeL_precision_stderr": 0.001491352270771391, "rougeL_recall": 0.2811429796223486, "rougeL_recall_stderr": 0.0036958208158303246, "rougeLsum_fmeasure": 0.1764735128921134, "rougeLsum_fmeasure_stderr": 0.0022058663139781407, "rougeLsum_precision": 0.12918261249170612, "rougeLsum_precision_stderr": 0.0016727035049342064, "rougeLsum_recall": 0.30129412715173376, "rougeLsum_recall_stderr": 0.0041384700689898295}}, "1": {"article_DOC_summary": {"bleu": 2.0057184143447615, "bleu_stderr": 0.10784776015193966, "rouge1_fmeasure": 0.19977490507493711, "rouge1_fmeasure_stderr": 0.0027304130941086407, "rouge1_precision": 0.14227928785850447, "rouge1_precision_stderr": 0.002038647007352537, "rouge1_recall": 0.3496445389327938, "rouge1_recall_stderr": 0.004687420539003245, "rouge2_fmeasure": 0.04842756093378267, "rouge2_fmeasure_stderr": 0.0016766359525896973, "rouge2_precision": 0.0341741556467082, "rouge2_precision_stderr": 0.0011919230006458453, "rouge2_recall": 0.0868085117734084, "rouge2_recall_stderr": 0.0030751950321611407, "rougeL_fmeasure": 0.15299299492001714, "rougeL_fmeasure_stderr": 0.0020609720286366845, "rougeL_precision": 0.1087255275926617, "rougeL_precision_stderr": 0.0015201339104374042, "rougeL_recall": 0.26941360644447593, "rougeL_recall_stderr": 0.0036873426379033505, "rougeLsum_fmeasure": 0.1598451013344069, "rougeLsum_fmeasure_stderr": 0.0023180038401233254, "rougeLsum_precision": 0.11362838643634136, "rougeLsum_precision_stderr": 0.0017063249170823618, "rougeLsum_recall": 0.28136113073762375, "rougeLsum_recall_stderr": 0.0041294851110808455}}, "2": {"article_DOC_summary": {"bleu": 2.2208813740601956, "bleu_stderr": 0.08894604311937805, "rouge1_fmeasure": 0.208834898706681, "rouge1_fmeasure_stderr": 0.0026978364031301984, "rouge1_precision": 0.14892949469644043, "rouge1_precision_stderr": 0.00200583160476678, "rouge1_recall": 0.3634135497733337, "rouge1_recall_stderr": 0.004671515326255189, "rouge2_fmeasure": 0.05372147678733591, "rouge2_fmeasure_stderr": 0.0017727625629579478, "rouge2_precision": 0.037878856143553255, "rouge2_precision_stderr": 0.0012582826871233468, "rouge2_recall": 0.09625625668090708, "rouge2_recall_stderr": 0.0032497068654648584, "rougeL_fmeasure": 0.1628407034853261, "rougeL_fmeasure_stderr": 0.0020951830521425178, "rougeL_precision": 0.11593507352191997, "rougeL_precision_stderr": 0.0015458134885275524, "rougeL_recall": 0.2849234476380913, "rougeL_recall_stderr": 0.0037398830811592306, "rougeLsum_fmeasure": 0.16488303511797936, "rougeLsum_fmeasure_stderr": 0.0022824435253937184, "rougeLsum_precision": 0.1172906648583005, "rougeLsum_precision_stderr": 0.0016670598084846545, "rougeLsum_recall": 0.2888804067318797, "rougeLsum_recall_stderr": 0.004097916730726731}}}}
|
8b7178b58b/evaluation/rankeval/8b7178b58b_0.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.331,0.014888272588203934,0
|
3 |
+
anli_r2,acc,0.352,0.015110404505648668,0
|
4 |
+
anli_r3,acc,0.35583333333333333,0.01382651874849331,0
|
5 |
+
arc_challenge,acc,0.32081911262798635,0.013640943091946526,0
|
6 |
+
arc_challenge,acc_norm,0.33447098976109213,0.013787460322441374,0
|
7 |
+
arc_easy,acc,0.6637205387205387,0.009694178072725206,0
|
8 |
+
arc_easy,acc_norm,0.5896464646464646,0.010093531255765452,0
|
9 |
+
boolq,acc,0.6284403669724771,0.008451598145076575,1
|
10 |
+
cb,acc,0.2857142857142857,0.06091449038731724,1
|
11 |
+
cb,f1,0.1717171717171717,,1
|
12 |
+
copa,acc,0.8,0.040201512610368445,0
|
13 |
+
hellaswag,acc,0.5319657438757219,0.004979573765575866,0
|
14 |
+
hellaswag,acc_norm,0.7045409281019717,0.004553164013379556,0
|
15 |
+
piqa,acc,0.7731229597388466,0.009771584259215172,0
|
16 |
+
piqa,acc_norm,0.7829162132752993,0.009618708415756788,0
|
17 |
+
rte,acc,0.5667870036101083,0.029826764082138277,0
|
18 |
+
sciq,acc,0.89,0.00989939381972444,0
|
19 |
+
sciq,acc_norm,0.815,0.012285191326386684,0
|
20 |
+
storycloze_2016,acc,0.7525387493319081,0.009979234591920141,0
|
21 |
+
winogrande,acc,0.6243093922651933,0.013611257508380437,0
|
8b7178b58b/evaluation/rankeval/8b7178b58b_0_lm-eval_global_step84877_2023-01-31-11-38-06_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.331,
|
5 |
-
"acc_stderr": 0.014888272588203934
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.352,
|
9 |
-
"acc_stderr": 0.015110404505648668
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.35583333333333333,
|
13 |
-
"acc_stderr": 0.01382651874849331
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.2857142857142857,
|
17 |
-
"acc_stderr": 0.06091449038731724,
|
18 |
-
"f1": 0.1717171717171717
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.8,
|
22 |
-
"acc_stderr": 0.040201512610368445
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5319657438757219,
|
26 |
-
"acc_stderr": 0.004979573765575866,
|
27 |
-
"acc_norm": 0.7045409281019717,
|
28 |
-
"acc_norm_stderr": 0.004553164013379556
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5667870036101083,
|
32 |
-
"acc_stderr": 0.029826764082138277
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6243093922651933,
|
36 |
-
"acc_stderr": 0.013611257508380437
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7525387493319081,
|
40 |
-
"acc_stderr": 0.009979234591920141
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6284403669724771,
|
44 |
-
"acc_stderr": 0.008451598145076575
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6637205387205387,
|
48 |
-
"acc_stderr": 0.009694178072725206,
|
49 |
-
"acc_norm": 0.5896464646464646,
|
50 |
-
"acc_norm_stderr": 0.010093531255765452
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.32081911262798635,
|
54 |
-
"acc_stderr": 0.013640943091946526,
|
55 |
-
"acc_norm": 0.33447098976109213,
|
56 |
-
"acc_norm_stderr": 0.013787460322441374
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.89,
|
60 |
-
"acc_stderr": 0.00989939381972444,
|
61 |
-
"acc_norm": 0.815,
|
62 |
-
"acc_norm_stderr": 0.012285191326386684
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7731229597388466,
|
66 |
-
"acc_stderr": 0.009771584259215172,
|
67 |
-
"acc_norm": 0.7829162132752993,
|
68 |
-
"acc_norm_stderr": 0.009618708415756788
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b58b/evaluation/rankeval/8b7178b58b_1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.293,0.014399942998441271,0
|
3 |
+
anli_r2,acc,0.327,0.01484221315341124,0
|
4 |
+
anli_r3,acc,0.3383333333333333,0.013664144006618275,0
|
5 |
+
arc_challenge,acc,0.33447098976109213,0.013787460322441384,0
|
6 |
+
arc_challenge,acc_norm,0.3438566552901024,0.01388064457015621,0
|
7 |
+
arc_easy,acc,0.6759259259259259,0.009603728850095394,0
|
8 |
+
arc_easy,acc_norm,0.640993265993266,0.009843424713072176,0
|
9 |
+
boolq,acc,0.6669724770642201,0.00824302391268888,1
|
10 |
+
cb,acc,0.32142857142857145,0.06297362289056341,1
|
11 |
+
cb,f1,0.2706949089557785,,1
|
12 |
+
copa,acc,0.77,0.042295258468165065,0
|
13 |
+
hellaswag,acc,0.5265883290181239,0.0049827214724073405,0
|
14 |
+
hellaswag,acc_norm,0.7029476199960167,0.00456025908319738,0
|
15 |
+
piqa,acc,0.7763873775843307,0.009721489519176294,0
|
16 |
+
piqa,acc_norm,0.7883569096844396,0.009530351270479392,0
|
17 |
+
rte,acc,0.5595667870036101,0.029882123363118723,0
|
18 |
+
sciq,acc,0.928,0.008178195576218681,0
|
19 |
+
sciq,acc_norm,0.911,0.009008893392651523,0
|
20 |
+
storycloze_2016,acc,0.7413148049171566,0.010126662138021714,0
|
21 |
+
winogrande,acc,0.6243093922651933,0.013611257508380444,0
|
8b7178b58b/evaluation/rankeval/8b7178b58b_1_lm-eval_global_step84877_2023-01-31-11-38-06_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.293,
|
5 |
-
"acc_stderr": 0.014399942998441271
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.327,
|
9 |
-
"acc_stderr": 0.01484221315341124
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3383333333333333,
|
13 |
-
"acc_stderr": 0.013664144006618275
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.32142857142857145,
|
17 |
-
"acc_stderr": 0.06297362289056341,
|
18 |
-
"f1": 0.2706949089557785
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.77,
|
22 |
-
"acc_stderr": 0.042295258468165065
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.5265883290181239,
|
26 |
-
"acc_stderr": 0.0049827214724073405,
|
27 |
-
"acc_norm": 0.7029476199960167,
|
28 |
-
"acc_norm_stderr": 0.00456025908319738
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5595667870036101,
|
32 |
-
"acc_stderr": 0.029882123363118723
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6243093922651933,
|
36 |
-
"acc_stderr": 0.013611257508380444
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7413148049171566,
|
40 |
-
"acc_stderr": 0.010126662138021714
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6669724770642201,
|
44 |
-
"acc_stderr": 0.00824302391268888
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6759259259259259,
|
48 |
-
"acc_stderr": 0.009603728850095394,
|
49 |
-
"acc_norm": 0.640993265993266,
|
50 |
-
"acc_norm_stderr": 0.009843424713072176
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.33447098976109213,
|
54 |
-
"acc_stderr": 0.013787460322441384,
|
55 |
-
"acc_norm": 0.3438566552901024,
|
56 |
-
"acc_norm_stderr": 0.01388064457015621
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.928,
|
60 |
-
"acc_stderr": 0.008178195576218681,
|
61 |
-
"acc_norm": 0.911,
|
62 |
-
"acc_norm_stderr": 0.009008893392651523
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7763873775843307,
|
66 |
-
"acc_stderr": 0.009721489519176294,
|
67 |
-
"acc_norm": 0.7883569096844396,
|
68 |
-
"acc_norm_stderr": 0.009530351270479392
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8b7178b58b/evaluation/rankeval/8b7178b58b_2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
task,metric,value,err,version
|
2 |
+
anli_r1,acc,0.295,0.014428554438445512,0
|
3 |
+
anli_r2,acc,0.312,0.014658474370509007,0
|
4 |
+
anli_r3,acc,0.33416666666666667,0.013622434813136769,0
|
5 |
+
arc_challenge,acc,0.3250853242320819,0.013688147309729122,0
|
6 |
+
arc_challenge,acc_norm,0.34812286689419797,0.013921008595179333,0
|
7 |
+
arc_easy,acc,0.6805555555555556,0.009567482017268095,0
|
8 |
+
arc_easy,acc_norm,0.6565656565656566,0.00974381736896003,0
|
9 |
+
boolq,acc,0.6626911314984709,0.008269171495741622,1
|
10 |
+
cb,acc,0.21428571428571427,0.055328333517248834,1
|
11 |
+
cb,f1,0.1865942028985507,,1
|
12 |
+
copa,acc,0.83,0.03775251680686371,0
|
13 |
+
hellaswag,acc,0.5261900019916351,0.0049829315659459545,0
|
14 |
+
hellaswag,acc_norm,0.702549292969528,0.004562022467161891,0
|
15 |
+
piqa,acc,0.7709466811751904,0.009804509865175504,0
|
16 |
+
piqa,acc_norm,0.7856365614798694,0.009574842136050964,0
|
17 |
+
rte,acc,0.5342960288808665,0.030025579819366426,0
|
18 |
+
sciq,acc,0.933,0.007910345983177549,0
|
19 |
+
sciq,acc_norm,0.92,0.008583336977753655,0
|
20 |
+
storycloze_2016,acc,0.7536076964190273,0.009964727533753548,0
|
21 |
+
winogrande,acc,0.6250986582478295,0.013605544523788,0
|