Update README.md
Browse files
README.md
CHANGED
@@ -86,6 +86,76 @@ print(tokenizer.decode(outputs[0]))
# Nous Benchmark

### Training hyperparameters
# Nous Benchmark

Agieval

| Task | Version | Metric | Value | | StdErr |
|-------------------------------------------|---------|--------|-------|---|---------|
| agieval\_aqua\_rat | 0 | acc | 24.80 | _ | 2.72 |
| agieval\_aqua\_rat | 0 | acc\_norm | 24.80 | _ | 2.72 |
| agieval\_logiqa\_en | 0 | acc | 20.89 | _ | 1.59 |
| agieval\_logiqa\_en | 0 | acc\_norm | 23.35 | _ | 1.66 |
| agieval\_lsat\_ar | 0 | acc | 21.74 | _ | 2.73 |
| agieval\_lsat\_ar | 0 | acc\_norm | 20.43 | _ | 2.66 |
| agieval\_lsat\_lr | 0 | acc | 15.49 | _ | 1.60 |
| agieval\_lsat\_lr | 0 | acc\_norm | 20.59 | _ | 1.79 |
| agieval\_lsat\_rc | 0 | acc | 17.10 | _ | 2.30 |
| agieval\_lsat\_rc | 0 | acc\_norm | 17.84 | _ | 2.34 |
| agieval\_sat\_en | 0 | acc | 29.61 | _ | 3.19 |
| agieval\_sat\_en | 0 | acc\_norm | 29.61 | _ | 3.19 |
| agieval\_sat\_en\_without\_passage | 0 | acc | 26.21 | _ | 3.07 |
| agieval\_sat\_en\_without\_passage | 0 | acc\_norm | 24.76 | _ | 3.01 |
| agieval\_sat\_math | 0 | acc | 22.73 | _ | 2.83 |
| agieval\_sat\_math | 0 | acc\_norm | 22.73 | _ | 2.83 |

Average: 22.29

GPT4ALL

| Task | Version | Metric | Value | | StdErr |
|---------------|---------|------------|---------|---|-------------|
| arc_challenge | 0 | acc | 20.14 | _ | 1.17 |
| arc_challenge | 0 | acc_norm | 22.87 | _ | 1.23 |
| arc_easy | 0 | acc | 32.37 | _ | 0.96 |
| arc_easy | 0 | acc_norm | 31.61 | _ | 0.95 |
| boolq | 1 | acc | 45.78 | _ | 0.87 |
| hellaswag | 0 | acc | 32.03 | _ | 0.47 |
| hellaswag | 0 | acc_norm | 35.18 | _ | 0.48 |
| openbookqa | 0 | acc | 17.8 | _ | 1.71 |
| openbookqa | 0 | acc_norm | 29.8 | _ | 2.05 |
| piqa | 0 | acc | 54.46 | _ | 1.16 |
| piqa | 0 | acc_norm | 54.57 | _ | 1.16 |
| winogrande | 0 | acc | 48.30 | _ | 1.40 |

Average: 32.00

TruthfulQA

| Task | Version | Metric | Value | Std Err |
|----------------------------------|---------|--------|--------|----------|
| truthfulqa\_mc | 1 | mc1 | 30.11 | 1.61 |
| truthfulqa\_mc | 1 | mc2 | 47.69 | 1.61 |

Average: 38.90

# Openllm Benchmark

| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |48.12|± | 1.46|
| | |acc_norm|51.27|± | 1.46|
|hellaswag | 0|acc |55.4 |± | 0.49|
| | |acc_norm|71.92|± | 0.42|
|gsm8k | 0|acc |29.87|± | 1.2 |
|winogrande | 0|acc |68.19|± | 1.3 |
|mmlu | 0|acc |53.62|± | 0.6 |

Average: 73.5%

### TruthfulQA

| Task |Version|Metric|Value| |Stderr|
|-------------|------:|------|----:|---|-----:|
|truthfulqa_mc| 1|mc1 |30.23|± | 1.60|
| | |mc2 |47.17|± | 1.63|

### Training hyperparameters