Update to 14-category MMLU-Pro model - fixes #584 #591 (commit f504ee5, verified)
{
"validation_accuracy": 0.8526315789473684,
"test_accuracy": 0.8343490304709141,
"validation_report": {
"biology": {
"precision": 0.8425925925925926,
"recall": 0.8425925925925926,
"f1-score": 0.8425925925925926,
"support": 108.0
},
"business": {
"precision": 0.8695652173913043,
"recall": 0.8403361344537815,
"f1-score": 0.8547008547008547,
"support": 119.0
},
"chemistry": {
"precision": 0.861271676300578,
"recall": 0.8764705882352941,
"f1-score": 0.8688046647230321,
"support": 170.0
},
"computer science": {
"precision": 0.8507462686567164,
"recall": 0.9193548387096774,
"f1-score": 0.8837209302325582,
"support": 62.0
},
"economics": {
"precision": 0.7986111111111112,
"recall": 0.905511811023622,
"f1-score": 0.8487084870848709,
"support": 127.0
},
"engineering": {
"precision": 0.8776978417266187,
"recall": 0.8413793103448276,
"f1-score": 0.8591549295774648,
"support": 145.0
},
"health": {
"precision": 0.8095238095238095,
"recall": 0.8292682926829268,
"f1-score": 0.8192771084337349,
"support": 123.0
},
"history": {
"precision": 0.896551724137931,
"recall": 0.9122807017543859,
"f1-score": 0.9043478260869565,
"support": 57.0
},
"law": {
"precision": 0.9585798816568047,
"recall": 0.9818181818181818,
"f1-score": 0.9700598802395209,
"support": 165.0
},
"math": {
"precision": 0.8888888888888888,
"recall": 0.9108910891089109,
"f1-score": 0.8997555012224939,
"support": 202.0
},
"other": {
"precision": 0.7916666666666666,
"recall": 0.6884057971014492,
"f1-score": 0.7364341085271318,
"support": 138.0
},
"philosophy": {
"precision": 0.8375,
"recall": 0.8933333333333333,
"f1-score": 0.864516129032258,
"support": 75.0
},
"physics": {
"precision": 0.8157894736842105,
"recall": 0.7948717948717948,
"f1-score": 0.8051948051948052,
"support": 195.0
},
"psychology": {
"precision": 0.8073394495412844,
"recall": 0.7394957983193278,
"f1-score": 0.7719298245614035,
"support": 119.0
},
"accuracy": 0.8526315789473684,
"macro avg": {
"precision": 0.8504517572770369,
"recall": 0.8554293045964361,
"f1-score": 0.8520855458721199,
"support": 1805.0
},
"weighted avg": {
"precision": 0.8520451342930214,
"recall": 0.8526315789473684,
"f1-score": 0.8515396065019253,
"support": 1805.0
}
},
"test_report": {
"biology": {
"precision": 0.8461538461538461,
"recall": 0.822429906542056,
"f1-score": 0.8341232227488151,
"support": 107.0
},
"business": {
"precision": 0.8648648648648649,
"recall": 0.8135593220338984,
"f1-score": 0.8384279475982532,
"support": 118.0
},
"chemistry": {
"precision": 0.8170731707317073,
"recall": 0.788235294117647,
"f1-score": 0.8023952095808383,
"support": 170.0
},
"computer science": {
"precision": 0.8301886792452831,
"recall": 0.7213114754098361,
"f1-score": 0.7719298245614035,
"support": 61.0
},
"economics": {
"precision": 0.8688524590163934,
"recall": 0.8412698412698413,
"f1-score": 0.8548387096774194,
"support": 126.0
},
"engineering": {
"precision": 0.8394160583941606,
"recall": 0.7876712328767124,
"f1-score": 0.8127208480565371,
"support": 146.0
},
"health": {
"precision": 0.8790322580645161,
"recall": 0.8861788617886179,
"f1-score": 0.8825910931174089,
"support": 123.0
},
"history": {
"precision": 0.9433962264150944,
"recall": 0.8771929824561403,
"f1-score": 0.9090909090909091,
"support": 57.0
},
"law": {
"precision": 0.9457831325301205,
"recall": 0.9515151515151515,
"f1-score": 0.9486404833836858,
"support": 165.0
},
"math": {
"precision": 0.8440366972477065,
"recall": 0.9064039408866995,
"f1-score": 0.8741092636579573,
"support": 203.0
},
"other": {
"precision": 0.6923076923076923,
"recall": 0.7122302158273381,
"f1-score": 0.7021276595744681,
"support": 139.0
},
"philosophy": {
"precision": 0.8333333333333334,
"recall": 0.9333333333333333,
"f1-score": 0.8805031446540881,
"support": 75.0
},
"physics": {
"precision": 0.7853658536585366,
"recall": 0.8256410256410256,
"f1-score": 0.805,
"support": 195.0
},
"psychology": {
"precision": 0.768595041322314,
"recall": 0.775,
"f1-score": 0.7717842323651453,
"support": 120.0
},
"accuracy": 0.8343490304709141,
"macro avg": {
"precision": 0.8398856652346834,
"recall": 0.8315694702641643,
"f1-score": 0.8348773248619237,
"support": 1805.0
},
"weighted avg": {
"precision": 0.8352151540840406,
"recall": 0.8343490304709141,
"f1-score": 0.8341367555565297,
"support": 1805.0
}
},
"validation_confusion_matrix": [
[
91,
0,
2,
0,
1,
0,
9,
0,
0,
0,
2,
0,
0,
3
],
[
0,
100,
1,
0,
10,
0,
0,
0,
0,
3,
2,
2,
0,
1
],
[
1,
0,
149,
0,
0,
5,
1,
0,
0,
1,
0,
0,
13,
0
],
[
0,
0,
0,
57,
0,
0,
0,
0,
0,
3,
2,
0,
0,
0
],
[
1,
5,
0,
0,
115,
0,
0,
0,
1,
2,
1,
2,
0,
0
],
[
0,
0,
3,
5,
0,
122,
0,
0,
0,
2,
0,
0,
13,
0
],
[
5,
0,
1,
0,
1,
0,
102,
0,
1,
0,
5,
3,
1,
4
],
[
1,
0,
0,
0,
0,
0,
1,
52,
0,
0,
2,
1,
0,
0
],
[
0,
0,
0,
0,
1,
0,
1,
0,
162,
0,
1,
0,
0,
0
],
[
0,
3,
0,
2,
2,
1,
1,
0,
1,
184,
1,
0,
3,
4
],
[
2,
5,
2,
1,
10,
0,
3,
3,
2,
2,
95,
2,
4,
7
],
[
1,
0,
0,
0,
1,
0,
1,
2,
1,
0,
1,
67,
0,
1
],
[
1,
0,
15,
0,
0,
11,
0,
0,
0,
9,
3,
0,
155,
1
],
[
5,
2,
0,
2,
3,
0,
7,
1,
1,
1,
5,
3,
1,
88
]
],
"test_confusion_matrix": [
[
88,
0,
0,
0,
1,
0,
6,
1,
2,
1,
1,
0,
1,
6
],
[
0,
96,
0,
0,
5,
0,
0,
0,
1,
6,
3,
2,
1,
4
],
[
2,
0,
134,
1,
0,
8,
0,
0,
0,
2,
2,
0,
21,
0
],
[
0,
0,
1,
44,
0,
3,
1,
0,
0,
9,
3,
0,
0,
0
],
[
1,
4,
0,
0,
106,
0,
0,
0,
1,
3,
6,
1,
0,
4
],
[
1,
0,
13,
1,
0,
115,
0,
0,
0,
6,
1,
0,
9,
0
],
[
6,
0,
1,
0,
1,
1,
109,
0,
0,
1,
2,
0,
0,
2
],
[
1,
0,
0,
0,
0,
0,
0,
50,
0,
0,
5,
0,
0,
1
],
[
0,
0,
0,
0,
0,
0,
0,
0,
157,
0,
4,
4,
0,
0
],
[
1,
2,
0,
5,
1,
0,
0,
0,
0,
184,
0,
0,
9,
1
],
[
0,
9,
1,
2,
7,
0,
2,
1,
4,
1,
99,
4,
2,
7
],
[
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
3,
70,
0,
2
],
[
1,
0,
14,
0,
0,
10,
0,
0,
0,
4,
4,
0,
161,
1
],
[
3,
0,
0,
0,
1,
0,
6,
1,
1,
1,
10,
3,
1,
93
]
]
}
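
Note: each confusion-matrix row sums to the corresponding category's support, so rows are true labels and columns are predictions, in the same alphabetical category order as the reports above. For reference, a minimal sketch of how a metrics file with this shape is typically assembled with scikit-learn; build_metrics and the label/prediction variables are hypothetical illustrations, not the actual script used for this commit.

# Hypothetical sketch, assuming predictions and gold labels are lists of
# category-name strings for each split.
import json

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

LABELS = [
    "biology", "business", "chemistry", "computer science", "economics",
    "engineering", "health", "history", "law", "math", "other",
    "philosophy", "physics", "psychology",
]

def build_metrics(val_labels, val_preds, test_labels, test_preds):
    """Assemble a dict matching the JSON structure above."""
    return {
        "validation_accuracy": accuracy_score(val_labels, val_preds),
        "test_accuracy": accuracy_score(test_labels, test_preds),
        # output_dict=True yields the per-category precision/recall/f1/support
        # entries plus the "accuracy", "macro avg", and "weighted avg" keys.
        "validation_report": classification_report(
            val_labels, val_preds, labels=LABELS, output_dict=True
        ),
        "test_report": classification_report(
            test_labels, test_preds, labels=LABELS, output_dict=True
        ),
        # Rows are true labels, columns are predictions, in LABELS order.
        "validation_confusion_matrix": confusion_matrix(
            val_labels, val_preds, labels=LABELS
        ).tolist(),
        "test_confusion_matrix": confusion_matrix(
            test_labels, test_preds, labels=LABELS
        ).tolist(),
    }

# Usage sketch:
# with open("eval_results.json", "w") as f:
#     json.dump(build_metrics(val_y, val_p, test_y, test_p), f, indent=2)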