macadeliccc
committed on
Commit
•
4dc6ca7
1
Parent(s):
1802d6f
Update README.md
Browse files
README.md
CHANGED
@@ -50,74 +50,5 @@ The model is capable of quality code, math, and logical reasoning. Try whatever
|
|
50 |
|
51 |
# Evaluations
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
AGIEval
|
56 |
-
Task Version Metric Value Stderr
|
57 |
-
agieval_aqua_rat 0 acc 24.41 ± 2.70
|
58 |
-
acc_norm 24.80 ± 2.72
|
59 |
-
agieval_logiqa_en 0 acc 35.79 ± 1.88
|
60 |
-
acc_norm 36.71 ± 1.89
|
61 |
-
agieval_lsat_ar 0 acc 23.48 ± 2.80
|
62 |
-
acc_norm 23.91 ± 2.82
|
63 |
-
agieval_lsat_lr 0 acc 49.22 ± 2.22
|
64 |
-
acc_norm 50.00 ± 2.22
|
65 |
-
agieval_lsat_rc 0 acc 63.94 ± 2.93
|
66 |
-
acc_norm 64.31 ± 2.93
|
67 |
-
agieval_sat_en 0 acc 77.18 ± 2.93
|
68 |
-
acc_norm 76.70 ± 2.95
|
69 |
-
agieval_sat_en_without_passage 0 acc 45.15 ± 3.48
|
70 |
-
acc_norm 44.66 ± 3.47
|
71 |
-
agieval_sat_math 0 acc 33.64 ± 3.19
|
72 |
-
acc_norm 30.00 ± 3.10
|
73 |
-
|
74 |
-
Average: 43.89%
|
75 |
-
GPT4All
|
76 |
-
Task Version Metric Value Stderr
|
77 |
-
arc_challenge 0 acc 61.86 ± 1.42
|
78 |
-
acc_norm 62.88 ± 1.41
|
79 |
-
arc_easy 0 acc 84.34 ± 0.75
|
80 |
-
acc_norm 80.47 ± 0.81
|
81 |
-
boolq 1 acc 86.88 ± 0.59
|
82 |
-
hellaswag 0 acc 68.56 ± 0.46
|
83 |
-
acc_norm 85.16 ± 0.35
|
84 |
-
openbookqa 0 acc 37.00 ± 2.16
|
85 |
-
acc_norm 47.80 ± 2.24
|
86 |
-
piqa 0 acc 82.21 ± 0.89
|
87 |
-
acc_norm 83.68 ± 0.86
|
88 |
-
winogrande 0 acc 77.98 ± 1.16
|
89 |
-
|
90 |
-
Average: 74.98%
|
91 |
-
TruthfulQA
|
92 |
-
Task Version Metric Value Stderr
|
93 |
-
truthfulqa_mc 1 mc1 47.37 ± 1.75
|
94 |
-
mc2 63.96 ± 1.57
|
95 |
-
|
96 |
-
Average: 63.96%
|
97 |
-
Bigbench
|
98 |
-
Task Version Metric Value Stderr
|
99 |
-
bigbench_causal_judgement 0 multiple_choice_grade 55.26 ± 3.62
|
100 |
-
bigbench_date_understanding 0 multiple_choice_grade 63.14 ± 2.51
|
101 |
-
bigbench_disambiguation_qa 0 multiple_choice_grade 42.64 ± 3.08
|
102 |
-
bigbench_geometric_shapes 0 multiple_choice_grade 22.84 ± 2.22
|
103 |
-
exact_str_match 3.34 ± 0.95
|
104 |
-
bigbench_logical_deduction_five_objects 0 multiple_choice_grade 36.60 ± 2.16
|
105 |
-
bigbench_logical_deduction_seven_objects 0 multiple_choice_grade 25.57 ± 1.65
|
106 |
-
bigbench_logical_deduction_three_objects 0 multiple_choice_grade 56.00 ± 2.87
|
107 |
-
bigbench_movie_recommendation 0 multiple_choice_grade 42.40 ± 2.21
|
108 |
-
bigbench_navigate 0 multiple_choice_grade 54.70 ± 1.57
|
109 |
-
bigbench_reasoning_about_colored_objects 0 multiple_choice_grade 62.90 ± 1.08
|
110 |
-
bigbench_ruin_names 0 multiple_choice_grade 53.35 ± 2.36
|
111 |
-
bigbench_salient_translation_error_detection 0 multiple_choice_grade 24.35 ± 1.36
|
112 |
-
bigbench_snarks 0 multiple_choice_grade 62.43 ± 3.61
|
113 |
-
bigbench_sports_understanding 0 multiple_choice_grade 70.28 ± 1.46
|
114 |
-
bigbench_temporal_sequences 0 multiple_choice_grade 41.30 ± 1.56
|
115 |
-
bigbench_tracking_shuffled_objects_five_objects 0 multiple_choice_grade 22.32 ± 1.18
|
116 |
-
bigbench_tracking_shuffled_objects_seven_objects 0 multiple_choice_grade 17.77 ± 0.91
|
117 |
-
bigbench_tracking_shuffled_objects_three_objects 0 multiple_choice_grade 56.00 ± 2.87
|
118 |
-
|
119 |
-
Average: 44.99%
|
120 |
-
|
121 |
-
Average score: 56.96%
|
122 |
-
|
123 |
-
Elapsed time: 01:51:53
|
|
|
50 |
|
51 |
# Evaluations
|
52 |
|
53 |
+
TODO
|
54 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|