Update README.md
README.md (CHANGED)
---
library_name: transformers
tags: []
---

# Evaluation Results

## Big-Bench Hard (BBH)

Note: These results were obtained with corrected answer parsing for BBH in Eleuther's [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness); see [this PR](https://github.com/EleutherAI/lm-evaluation-harness/pull/2013).
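As a rough reproduction sketch (not the exact setup used for these numbers): the subtask names in the tables below correspond to the harness's `bbh_cot_fewshot_*` configs, so an evaluation along the following lines should give comparable scores. The model ID, dtype, and the `bbh_cot_fewshot` task-group name are assumptions, and the harness install must include the parsing fix from the PR linked above.

```python
# Minimal sketch, assuming EleutherAI's lm-evaluation-harness is installed from
# source at a revision that includes the BBH parsing fix linked above.
# The model ID and model_args are placeholders, not taken from this card.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=Qwen/Qwen2-72B-Instruct,dtype=bfloat16,parallelize=True",
    # Assumed task group: the breakdown tables list bbh_cot_fewshot_* subtasks,
    # whose configs already encode the 3-shot chain-of-thought prompts.
    tasks=["bbh_cot_fewshot"],
    batch_size="auto",
)

# Per-task exact_match under the "get-answer" filter, as reported in the tables.
for task, metrics in results["results"].items():
    print(task, metrics.get("exact_match,get-answer"))
```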
### Smaug-Qwen2-72B-Instruct

#### Overall:

|Groups|Version| Filter |n-shot| Metric | |Value | |Stderr|
|------|-------|----------|-----:|-----------|---|-----:|---|-----:|
|bbh |N/A |get-answer| 3|exact_match|↑ |0.8241|± |0.0042|

#### Breakdown:

| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
|----------------------------------------------------------|-------|----------|-----:|-----------|---|-----:|---|-----:|
|bbh |N/A |get-answer| 3|exact_match|↑ |0.8241|± |0.0042|
| - bbh_cot_fewshot_boolean_expressions | 2|get-answer| 3|exact_match|↑ |0.9640|± |0.0118|
| - bbh_cot_fewshot_causal_judgement | 2|get-answer| 3|exact_match|↑ |0.6578|± |0.0348|
| - bbh_cot_fewshot_date_understanding | 2|get-answer| 3|exact_match|↑ |0.8360|± |0.0235|
| - bbh_cot_fewshot_disambiguation_qa | 2|get-answer| 3|exact_match|↑ |0.8280|± |0.0239|
| - bbh_cot_fewshot_dyck_languages | 2|get-answer| 3|exact_match|↑ |0.3360|± |0.0299|
| - bbh_cot_fewshot_formal_fallacies | 2|get-answer| 3|exact_match|↑ |0.7120|± |0.0287|
| - bbh_cot_fewshot_geometric_shapes | 2|get-answer| 3|exact_match|↑ |0.5320|± |0.0316|
| - bbh_cot_fewshot_hyperbaton | 2|get-answer| 3|exact_match|↑ |0.9880|± |0.0069|
| - bbh_cot_fewshot_logical_deduction_five_objects | 2|get-answer| 3|exact_match|↑ |0.7680|± |0.0268|
| - bbh_cot_fewshot_logical_deduction_seven_objects | 2|get-answer| 3|exact_match|↑ |0.5360|± |0.0316|
| - bbh_cot_fewshot_logical_deduction_three_objects | 2|get-answer| 3|exact_match|↑ |0.9720|± |0.0105|
| - bbh_cot_fewshot_movie_recommendation | 2|get-answer| 3|exact_match|↑ |0.8000|± |0.0253|
| - bbh_cot_fewshot_multistep_arithmetic_two | 2|get-answer| 3|exact_match|↑ |0.9720|± |0.0105|
| - bbh_cot_fewshot_navigate | 2|get-answer| 3|exact_match|↑ |0.9640|± |0.0118|
| - bbh_cot_fewshot_object_counting | 2|get-answer| 3|exact_match|↑ |0.9200|± |0.0172|
| - bbh_cot_fewshot_penguins_in_a_table | 2|get-answer| 3|exact_match|↑ |0.8493|± |0.0297|
| - bbh_cot_fewshot_reasoning_about_colored_objects | 2|get-answer| 3|exact_match|↑ |0.7560|± |0.0272|
| - bbh_cot_fewshot_ruin_names | 2|get-answer| 3|exact_match|↑ |0.8520|± |0.0225|
| - bbh_cot_fewshot_salient_translation_error_detection | 2|get-answer| 3|exact_match|↑ |0.5920|± |0.0311|
| - bbh_cot_fewshot_snarks | 2|get-answer| 3|exact_match|↑ |0.9101|± |0.0215|
| - bbh_cot_fewshot_sports_understanding | 2|get-answer| 3|exact_match|↑ |0.9440|± |0.0146|
| - bbh_cot_fewshot_temporal_sequences | 2|get-answer| 3|exact_match|↑ |1.0000|± |0.0000|
| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 2|get-answer| 3|exact_match|↑ |0.9800|± |0.0089|
| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 2|get-answer| 3|exact_match|↑ |0.9560|± |0.0130|
| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 2|get-answer| 3|exact_match|↑ |0.9640|± |0.0118|
| - bbh_cot_fewshot_web_of_lies | 2|get-answer| 3|exact_match|↑ |1.0000|± |0.0000|
| - bbh_cot_fewshot_word_sorting | 2|get-answer| 3|exact_match|↑ |0.6560|± |0.0301|

### Qwen2-72B-Instruct

#### Overall:

|Groups|Version| Filter |n-shot| Metric | |Value | |Stderr|
|------|-------|----------|-----:|-----------|---|-----:|---|-----:|
|bbh |N/A |get-answer| 3|exact_match|↑ |0.8036|± |0.0044|

#### Breakdown:

| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
|----------------------------------------------------------|-------|----------|-----:|-----------|---|-----:|---|-----:|
|bbh |N/A |get-answer| 3|exact_match|↑ |0.8036|± |0.0044|
| - bbh_cot_fewshot_boolean_expressions | 2|get-answer| 3|exact_match|↑ |0.9640|± |0.0118|
| - bbh_cot_fewshot_causal_judgement | 2|get-answer| 3|exact_match|↑ |0.6684|± |0.0345|
| - bbh_cot_fewshot_date_understanding | 2|get-answer| 3|exact_match|↑ |0.8000|± |0.0253|
| - bbh_cot_fewshot_disambiguation_qa | 2|get-answer| 3|exact_match|↑ |0.8360|± |0.0235|
| - bbh_cot_fewshot_dyck_languages | 2|get-answer| 3|exact_match|↑ |0.3040|± |0.0292|
| - bbh_cot_fewshot_formal_fallacies | 2|get-answer| 3|exact_match|↑ |0.7480|± |0.0275|
| - bbh_cot_fewshot_geometric_shapes | 2|get-answer| 3|exact_match|↑ |0.4960|± |0.0317|
| - bbh_cot_fewshot_hyperbaton | 2|get-answer| 3|exact_match|↑ |0.9440|± |0.0146|
| - bbh_cot_fewshot_logical_deduction_five_objects | 2|get-answer| 3|exact_match|↑ |0.6800|± |0.0296|
| - bbh_cot_fewshot_logical_deduction_seven_objects | 2|get-answer| 3|exact_match|↑ |0.4720|± |0.0316|
| - bbh_cot_fewshot_logical_deduction_three_objects | 2|get-answer| 3|exact_match|↑ |0.9200|± |0.0172|
| - bbh_cot_fewshot_movie_recommendation | 2|get-answer| 3|exact_match|↑ |0.7800|± |0.0263|
| - bbh_cot_fewshot_multistep_arithmetic_two | 2|get-answer| 3|exact_match|↑ |0.9760|± |0.0097|
| - bbh_cot_fewshot_navigate | 2|get-answer| 3|exact_match|↑ |0.9520|± |0.0135|
| - bbh_cot_fewshot_object_counting | 2|get-answer| 3|exact_match|↑ |0.9480|± |0.0141|
| - bbh_cot_fewshot_penguins_in_a_table | 2|get-answer| 3|exact_match|↑ |0.5753|± |0.0410|
| - bbh_cot_fewshot_reasoning_about_colored_objects | 2|get-answer| 3|exact_match|↑ |0.8120|± |0.0248|
| - bbh_cot_fewshot_ruin_names | 2|get-answer| 3|exact_match|↑ |0.8760|± |0.0209|
| - bbh_cot_fewshot_salient_translation_error_detection | 2|get-answer| 3|exact_match|↑ |0.5880|± |0.0312|
| - bbh_cot_fewshot_snarks | 2|get-answer| 3|exact_match|↑ |0.8764|± |0.0247|
| - bbh_cot_fewshot_sports_understanding | 2|get-answer| 3|exact_match|↑ |0.9080|± |0.0183|
| - bbh_cot_fewshot_temporal_sequences | 2|get-answer| 3|exact_match|↑ |0.9960|± |0.0040|
| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 2|get-answer| 3|exact_match|↑ |0.9160|± |0.0176|
| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 2|get-answer| 3|exact_match|↑ |0.9400|± |0.0151|
| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 2|get-answer| 3|exact_match|↑ |0.9440|± |0.0146|
| - bbh_cot_fewshot_web_of_lies | 2|get-answer| 3|exact_match|↑ |1.0000|± |0.0000|
| - bbh_cot_fewshot_word_sorting | 2|get-answer| 3|exact_match|↑ |0.6680|± |0.0298|

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->
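The card metadata lists `library_name: transformers`, so the checkpoint can presumably be loaded with the standard `transformers` causal-LM API. A minimal, hypothetical usage sketch follows; the repository ID is a placeholder, since the card template does not specify one.

```python
# Minimal sketch, assuming a chat-tuned causal LM served through transformers.
# "org/Smaug-Qwen2-72B-Instruct" is a placeholder repository ID.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "org/Smaug-Qwen2-72B-Instruct"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # a 72B model needs multiple GPUs or offloading
)

messages = [{"role": "user", "content": "Sort the words: cherry apple banana"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```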