Update README.md
Browse files
README.md
CHANGED
@@ -5,12 +5,156 @@ license: cc
|
|
5 |
- **What is this?** Nothing interesting, just an experiment.
|
6 |
- **License:** CC-BY-NC
|
7 |
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
- **What is this?** Nothing interesting, just an experiment.
|
6 |
- **License:** CC-BY-NC
|
7 |
|
8 |
+
```
|
9 |
+
| Task |Version| Metric |Value | |Stderr|
|
10 |
+
|------------------------------------------------------|------:|--------------|-----:|---|-----:|
|
11 |
+
|all | |acc |0.6502|± |0.0327|
|
12 |
+
| | |acc_norm |0.6414|± |0.0095|
|
13 |
+
| | |truthfulqa_mc1|0.3696|± |0.0169|
|
14 |
+
| | |truthfulqa_mc2|0.5305|± |0.0159|
|
15 |
+
| | |qem |0.4670|± |0.0137|
|
16 |
+
|leaderboard:arc:challenge:25 | 0|acc |0.5555|± |0.0145|
|
17 |
+
| | |acc_norm |0.5623|± |0.0145|
|
18 |
+
|leaderboard:gsm8k:5 | 0|qem |0.4670|± |0.0137|
|
19 |
+
|leaderboard:hellaswag:10 | 0|acc |0.5598|± |0.0050|
|
20 |
+
| | |acc_norm |0.7205|± |0.0045|
|
21 |
+
|leaderboard:mmlu:_average:5 | |acc |0.6527|± |0.0338|
|
22 |
+
|leaderboard:mmlu:abstract_algebra:5 | 0|acc |0.3300|± |0.0473|
|
23 |
+
|leaderboard:mmlu:anatomy:5 | 0|acc |0.6593|± |0.0409|
|
24 |
+
|leaderboard:mmlu:astronomy:5 | 0|acc |0.7303|± |0.0361|
|
25 |
+
|leaderboard:mmlu:business_ethics:5 | 0|acc |0.6700|± |0.0473|
|
26 |
+
|leaderboard:mmlu:clinical_knowledge:5 | 0|acc |0.7321|± |0.0273|
|
27 |
+
|leaderboard:mmlu:college_biology:5 | 0|acc |0.7708|± |0.0351|
|
28 |
+
|leaderboard:mmlu:college_chemistry:5 | 0|acc |0.4900|± |0.0502|
|
29 |
+
|leaderboard:mmlu:college_computer_science:5 | 0|acc |0.4600|± |0.0501|
|
30 |
+
|leaderboard:mmlu:college_mathematics:5 | 0|acc |0.3900|± |0.0490|
|
31 |
+
|leaderboard:mmlu:college_medicine:5 | 0|acc |0.6069|± |0.0372|
|
32 |
+
|leaderboard:mmlu:college_physics:5 | 0|acc |0.4706|± |0.0497|
|
33 |
+
|leaderboard:mmlu:computer_security:5 | 0|acc |0.7800|± |0.0416|
|
34 |
+
|leaderboard:mmlu:conceptual_physics:5 | 0|acc |0.5830|± |0.0322|
|
35 |
+
|leaderboard:mmlu:econometrics:5 | 0|acc |0.5000|± |0.0470|
|
36 |
+
|leaderboard:mmlu:electrical_engineering:5 | 0|acc |0.5862|± |0.0410|
|
37 |
+
|leaderboard:mmlu:elementary_mathematics:5 | 0|acc |0.4630|± |0.0257|
|
38 |
+
|leaderboard:mmlu:formal_logic:5 | 0|acc |0.5238|± |0.0447|
|
39 |
+
|leaderboard:mmlu:global_facts:5 | 0|acc |0.4300|± |0.0498|
|
40 |
+
|leaderboard:mmlu:high_school_biology:5 | 0|acc |0.7581|± |0.0244|
|
41 |
+
|leaderboard:mmlu:high_school_chemistry:5 | 0|acc |0.5271|± |0.0351|
|
42 |
+
|leaderboard:mmlu:high_school_computer_science:5 | 0|acc |0.6600|± |0.0476|
|
43 |
+
|leaderboard:mmlu:high_school_european_history:5 | 0|acc |0.7212|± |0.0350|
|
44 |
+
|leaderboard:mmlu:high_school_geography:5 | 0|acc |0.7929|± |0.0289|
|
45 |
+
|leaderboard:mmlu:high_school_government_and_politics:5| 0|acc |0.8756|± |0.0238|
|
46 |
+
|leaderboard:mmlu:high_school_macroeconomics:5 | 0|acc |0.6590|± |0.0240|
|
47 |
+
|leaderboard:mmlu:high_school_mathematics:5 | 0|acc |0.3407|± |0.0289|
|
48 |
+
|leaderboard:mmlu:high_school_microeconomics:5 | 0|acc |0.7563|± |0.0279|
|
49 |
+
|leaderboard:mmlu:high_school_physics:5 | 0|acc |0.4503|± |0.0406|
|
50 |
+
|leaderboard:mmlu:high_school_psychology:5 | 0|acc |0.8294|± |0.0161|
|
51 |
+
|leaderboard:mmlu:high_school_statistics:5 | 0|acc |0.4954|± |0.0341|
|
52 |
+
|leaderboard:mmlu:high_school_us_history:5 | 0|acc |0.8039|± |0.0279|
|
53 |
+
|leaderboard:mmlu:high_school_world_history:5 | 0|acc |0.8186|± |0.0251|
|
54 |
+
|leaderboard:mmlu:human_aging:5 | 0|acc |0.6951|± |0.0309|
|
55 |
+
|leaderboard:mmlu:human_sexuality:5 | 0|acc |0.7863|± |0.0360|
|
56 |
+
|leaderboard:mmlu:international_law:5 | 0|acc |0.8017|± |0.0364|
|
57 |
+
|leaderboard:mmlu:jurisprudence:5 | 0|acc |0.8056|± |0.0383|
|
58 |
+
|leaderboard:mmlu:logical_fallacies:5 | 0|acc |0.7362|± |0.0346|
|
59 |
+
|leaderboard:mmlu:machine_learning:5 | 0|acc |0.4911|± |0.0475|
|
60 |
+
|leaderboard:mmlu:management:5 | 0|acc |0.8252|± |0.0376|
|
61 |
+
|leaderboard:mmlu:marketing:5 | 0|acc |0.8718|± |0.0219|
|
62 |
+
|leaderboard:mmlu:medical_genetics:5 | 0|acc |0.6900|± |0.0465|
|
63 |
+
|leaderboard:mmlu:miscellaneous:5 | 0|acc |0.8225|± |0.0137|
|
64 |
+
|leaderboard:mmlu:moral_disputes:5 | 0|acc |0.7052|± |0.0245|
|
65 |
+
|leaderboard:mmlu:moral_scenarios:5 | 0|acc |0.4190|± |0.0165|
|
66 |
+
|leaderboard:mmlu:nutrition:5 | 0|acc |0.7353|± |0.0253|
|
67 |
+
|leaderboard:mmlu:philosophy:5 | 0|acc |0.7203|± |0.0255|
|
68 |
+
|leaderboard:mmlu:prehistory:5 | 0|acc |0.6975|± |0.0256|
|
69 |
+
|leaderboard:mmlu:professional_accounting:5 | 0|acc |0.5035|± |0.0298|
|
70 |
+
|leaderboard:mmlu:professional_law:5 | 0|acc |0.4576|± |0.0127|
|
71 |
+
|leaderboard:mmlu:professional_medicine:5 | 0|acc |0.7132|± |0.0275|
|
72 |
+
|leaderboard:mmlu:professional_psychology:5 | 0|acc |0.6879|± |0.0187|
|
73 |
+
|leaderboard:mmlu:public_relations:5 | 0|acc |0.6545|± |0.0455|
|
74 |
+
|leaderboard:mmlu:security_studies:5 | 0|acc |0.7388|± |0.0281|
|
75 |
+
|leaderboard:mmlu:sociology:5 | 0|acc |0.8159|± |0.0274|
|
76 |
+
|leaderboard:mmlu:us_foreign_policy:5 | 0|acc |0.8500|± |0.0359|
|
77 |
+
|leaderboard:mmlu:virology:5 | 0|acc |0.5000|± |0.0389|
|
78 |
+
|leaderboard:mmlu:world_religions:5 | 0|acc |0.8129|± |0.0299|
|
79 |
+
|leaderboard:truthfulqa:mc:0 | 0|truthfulqa_mc1|0.3696|± |0.0169|
|
80 |
+
| | |truthfulqa_mc2|0.5305|± |0.0159|
|
81 |
+
|leaderboard:winogrande:5 | 0|acc |0.6938|± |0.0130|
|
82 |
+
```
|
83 |
|
84 |
+
Baseline results (same tasks and metrics as the table above):
|
85 |
|
86 |
+
```
|
87 |
+
| Task |Version| Metric |Value | |Stderr|
|
88 |
+
|------------------------------------------------------|------:|--------------|-----:|---|-----:|
|
89 |
+
|all | |acc |0.6635|± |0.0322|
|
90 |
+
| | |acc_norm |0.6569|± |0.0094|
|
91 |
+
| | |truthfulqa_mc1|0.3745|± |0.0169|
|
92 |
+
| | |truthfulqa_mc2|0.5338|± |0.0160|
|
93 |
+
| | |qem |0.6808|± |0.0128|
|
94 |
+
|leaderboard:arc:challenge:25 | 0|acc |0.5742|± |0.0144|
|
95 |
+
| | |acc_norm |0.5828|± |0.0144|
|
96 |
+
|leaderboard:gsm8k:5 | 0|qem |0.6808|± |0.0128|
|
97 |
+
|leaderboard:hellaswag:10 | 0|acc |0.5707|± |0.0049|
|
98 |
+
| | |acc_norm |0.7310|± |0.0044|
|
99 |
+
|leaderboard:mmlu:_average:5 | |acc |0.6662|± |0.0333|
|
100 |
+
|leaderboard:mmlu:abstract_algebra:5 | 0|acc |0.3300|± |0.0473|
|
101 |
+
|leaderboard:mmlu:anatomy:5 | 0|acc |0.6815|± |0.0402|
|
102 |
+
|leaderboard:mmlu:astronomy:5 | 0|acc |0.7500|± |0.0352|
|
103 |
+
|leaderboard:mmlu:business_ethics:5 | 0|acc |0.7000|± |0.0461|
|
104 |
+
|leaderboard:mmlu:clinical_knowledge:5 | 0|acc |0.7472|± |0.0267|
|
105 |
+
|leaderboard:mmlu:college_biology:5 | 0|acc |0.7917|± |0.0340|
|
106 |
+
|leaderboard:mmlu:college_chemistry:5 | 0|acc |0.4500|± |0.0500|
|
107 |
+
|leaderboard:mmlu:college_computer_science:5 | 0|acc |0.5200|± |0.0502|
|
108 |
+
|leaderboard:mmlu:college_mathematics:5 | 0|acc |0.3900|± |0.0490|
|
109 |
+
|leaderboard:mmlu:college_medicine:5 | 0|acc |0.6590|± |0.0361|
|
110 |
+
|leaderboard:mmlu:college_physics:5 | 0|acc |0.4314|± |0.0493|
|
111 |
+
|leaderboard:mmlu:computer_security:5 | 0|acc |0.7900|± |0.0409|
|
112 |
+
|leaderboard:mmlu:conceptual_physics:5 | 0|acc |0.5872|± |0.0322|
|
113 |
+
|leaderboard:mmlu:econometrics:5 | 0|acc |0.5439|± |0.0469|
|
114 |
+
|leaderboard:mmlu:electrical_engineering:5 | 0|acc |0.6138|± |0.0406|
|
115 |
+
|leaderboard:mmlu:elementary_mathematics:5 | 0|acc |0.4683|± |0.0257|
|
116 |
+
|leaderboard:mmlu:formal_logic:5 | 0|acc |0.5317|± |0.0446|
|
117 |
+
|leaderboard:mmlu:global_facts:5 | 0|acc |0.4600|± |0.0501|
|
118 |
+
|leaderboard:mmlu:high_school_biology:5 | 0|acc |0.8065|± |0.0225|
|
119 |
+
|leaderboard:mmlu:high_school_chemistry:5 | 0|acc |0.5419|± |0.0351|
|
120 |
+
|leaderboard:mmlu:high_school_computer_science:5 | 0|acc |0.6800|± |0.0469|
|
121 |
+
|leaderboard:mmlu:high_school_european_history:5 | 0|acc |0.7394|± |0.0343|
|
122 |
+
|leaderboard:mmlu:high_school_geography:5 | 0|acc |0.8131|± |0.0278|
|
123 |
+
|leaderboard:mmlu:high_school_government_and_politics:5| 0|acc |0.8964|± |0.0220|
|
124 |
+
|leaderboard:mmlu:high_school_macroeconomics:5 | 0|acc |0.6769|± |0.0237|
|
125 |
+
|leaderboard:mmlu:high_school_mathematics:5 | 0|acc |0.3259|± |0.0286|
|
126 |
+
|leaderboard:mmlu:high_school_microeconomics:5 | 0|acc |0.7563|± |0.0279|
|
127 |
+
|leaderboard:mmlu:high_school_physics:5 | 0|acc |0.4106|± |0.0402|
|
128 |
+
|leaderboard:mmlu:high_school_psychology:5 | 0|acc |0.8477|± |0.0154|
|
129 |
+
|leaderboard:mmlu:high_school_statistics:5 | 0|acc |0.4769|± |0.0341|
|
130 |
+
|leaderboard:mmlu:high_school_us_history:5 | 0|acc |0.7892|± |0.0286|
|
131 |
+
|leaderboard:mmlu:high_school_world_history:5 | 0|acc |0.8397|± |0.0239|
|
132 |
+
|leaderboard:mmlu:human_aging:5 | 0|acc |0.7265|± |0.0299|
|
133 |
+
|leaderboard:mmlu:human_sexuality:5 | 0|acc |0.7939|± |0.0355|
|
134 |
+
|leaderboard:mmlu:international_law:5 | 0|acc |0.7686|± |0.0385|
|
135 |
+
|leaderboard:mmlu:jurisprudence:5 | 0|acc |0.7593|± |0.0413|
|
136 |
+
|leaderboard:mmlu:logical_fallacies:5 | 0|acc |0.7607|± |0.0335|
|
137 |
+
|leaderboard:mmlu:machine_learning:5 | 0|acc |0.5268|± |0.0474|
|
138 |
+
|leaderboard:mmlu:management:5 | 0|acc |0.8155|± |0.0384|
|
139 |
+
|leaderboard:mmlu:marketing:5 | 0|acc |0.9060|± |0.0191|
|
140 |
+
|leaderboard:mmlu:medical_genetics:5 | 0|acc |0.7900|± |0.0409|
|
141 |
+
|leaderboard:mmlu:miscellaneous:5 | 0|acc |0.8238|± |0.0136|
|
142 |
+
|leaderboard:mmlu:moral_disputes:5 | 0|acc |0.7399|± |0.0236|
|
143 |
+
|leaderboard:mmlu:moral_scenarios:5 | 0|acc |0.4358|± |0.0166|
|
144 |
+
|leaderboard:mmlu:nutrition:5 | 0|acc |0.7549|± |0.0246|
|
145 |
+
|leaderboard:mmlu:philosophy:5 | 0|acc |0.7331|± |0.0251|
|
146 |
+
|leaderboard:mmlu:prehistory:5 | 0|acc |0.7469|± |0.0242|
|
147 |
+
|leaderboard:mmlu:professional_accounting:5 | 0|acc |0.5177|± |0.0298|
|
148 |
+
|leaderboard:mmlu:professional_law:5 | 0|acc |0.4648|± |0.0127|
|
149 |
+
|leaderboard:mmlu:professional_medicine:5 | 0|acc |0.7279|± |0.0270|
|
150 |
+
|leaderboard:mmlu:professional_psychology:5 | 0|acc |0.6928|± |0.0187|
|
151 |
+
|leaderboard:mmlu:public_relations:5 | 0|acc |0.6636|± |0.0453|
|
152 |
+
|leaderboard:mmlu:security_studies:5 | 0|acc |0.7306|± |0.0284|
|
153 |
+
|leaderboard:mmlu:sociology:5 | 0|acc |0.8557|± |0.0248|
|
154 |
+
|leaderboard:mmlu:us_foreign_policy:5 | 0|acc |0.8600|± |0.0349|
|
155 |
+
|leaderboard:mmlu:virology:5 | 0|acc |0.5361|± |0.0388|
|
156 |
+
|leaderboard:mmlu:world_religions:5 | 0|acc |0.7953|± |0.0309|
|
157 |
+
|leaderboard:truthfulqa:mc:0 | 0|truthfulqa_mc1|0.3745|± |0.0169|
|
158 |
+
| | |truthfulqa_mc2|0.5338|± |0.0160|
|
159 |
+
|leaderboard:winogrande:5 | 0|acc |0.6930|± |0.0130|
|
160 |
+
```
|