Update README.md
Browse files
README.md
CHANGED
@@ -72,6 +72,31 @@ or
|
|
72 |
<leave a newline blank for model to respond>
|
73 |
```
|
74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
AGIEval
|
76 |
```
|
77 |
| Task |Version| Metric |Value | |Stderr|
|
|
|
72 |
<leave a newline blank for model to respond>
|
73 |
```
|
74 |
|
75 |
+
BigBench:
|
76 |
+
```
|
77 |
+
| Task |Version| Metric |Value | |Stderr|
|
78 |
+
|------------------------------------------------|------:|---------------------|-----:|---|-----:|
|
79 |
+
|bigbench_causal_judgement | 0|multiple_choice_grade|0.5579|± |0.0361|
|
80 |
+
|bigbench_date_understanding | 0|multiple_choice_grade|0.6233|± |0.0253|
|
81 |
+
|bigbench_disambiguation_qa | 0|multiple_choice_grade|0.3062|± |0.0288|
|
82 |
+
|bigbench_geometric_shapes | 0|multiple_choice_grade|0.2006|± |0.0212|
|
83 |
+
| | |exact_str_match |0.0000|± |0.0000|
|
84 |
+
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|0.2540|± |0.0195|
|
85 |
+
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|0.1657|± |0.0141|
|
86 |
+
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|0.4067|± |0.0284|
|
87 |
+
|bigbench_movie_recommendation | 0|multiple_choice_grade|0.2780|± |0.0201|
|
88 |
+
|bigbench_navigate | 0|multiple_choice_grade|0.5000|± |0.0158|
|
89 |
+
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|0.4405|± |0.0111|
|
90 |
+
|bigbench_ruin_names | 0|multiple_choice_grade|0.2701|± |0.0210|
|
91 |
+
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|0.2034|± |0.0127|
|
92 |
+
|bigbench_snarks | 0|multiple_choice_grade|0.5028|± |0.0373|
|
93 |
+
|bigbench_sports_understanding | 0|multiple_choice_grade|0.6136|± |0.0155|
|
94 |
+
|bigbench_temporal_sequences | 0|multiple_choice_grade|0.2720|± |0.0141|
|
95 |
+
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|0.1944|± |0.0112|
|
96 |
+
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|0.1497|± |0.0085|
|
97 |
+
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|0.4067|± |0.0284|
|
98 |
+
```
|
99 |
+
|
100 |
AGIEval
|
101 |
```
|
102 |
| Task |Version| Metric |Value | |Stderr|
|