leaderboard-pr-bot
committed on
Commit
•
0fc115c
1
Parent(s):
13f00a0
Adding Evaluation Results
Browse files
This is an automated PR created with https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr
The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.
If you encounter any issues, please report them to https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr/discussions
README.md
CHANGED
@@ -1,19 +1,19 @@
|
|
1 |
---
|
2 |
-
pipeline_tag: text-generation
|
3 |
-
inference: true
|
4 |
license: apache-2.0
|
5 |
-
datasets:
|
6 |
-
- GritLM/tulu2
|
7 |
tags:
|
8 |
- mteb
|
|
|
|
|
|
|
|
|
9 |
model-index:
|
10 |
- name: GritLM-7B
|
11 |
results:
|
12 |
- task:
|
13 |
type: Classification
|
14 |
dataset:
|
15 |
-
type: mteb/amazon_counterfactual
|
16 |
name: MTEB AmazonCounterfactualClassification (en)
|
|
|
17 |
config: en
|
18 |
split: test
|
19 |
revision: e8379541af4e31359cca9fbcf4b00f2671dba205
|
@@ -27,8 +27,8 @@ model-index:
|
|
27 |
- task:
|
28 |
type: Classification
|
29 |
dataset:
|
30 |
-
type: mteb/amazon_polarity
|
31 |
name: MTEB AmazonPolarityClassification
|
|
|
32 |
config: default
|
33 |
split: test
|
34 |
revision: e2d317d38cd51312af73b3d32a06d1a08b442046
|
@@ -42,8 +42,8 @@ model-index:
|
|
42 |
- task:
|
43 |
type: Classification
|
44 |
dataset:
|
45 |
-
type: mteb/amazon_reviews_multi
|
46 |
name: MTEB AmazonReviewsClassification (en)
|
|
|
47 |
config: en
|
48 |
split: test
|
49 |
revision: 1399c76144fd37290681b995c656ef9b2e06e26d
|
@@ -55,8 +55,8 @@ model-index:
|
|
55 |
- task:
|
56 |
type: Retrieval
|
57 |
dataset:
|
58 |
-
type: arguana
|
59 |
name: MTEB ArguAna
|
|
|
60 |
config: default
|
61 |
split: test
|
62 |
revision: None
|
@@ -124,8 +124,8 @@ model-index:
|
|
124 |
- task:
|
125 |
type: Clustering
|
126 |
dataset:
|
127 |
-
type: mteb/arxiv-clustering-p2p
|
128 |
name: MTEB ArxivClusteringP2P
|
|
|
129 |
config: default
|
130 |
split: test
|
131 |
revision: a122ad7f3f0291bf49cc6f4d32aa80929df69d5d
|
@@ -135,8 +135,8 @@ model-index:
|
|
135 |
- task:
|
136 |
type: Clustering
|
137 |
dataset:
|
138 |
-
type: mteb/arxiv-clustering-s2s
|
139 |
name: MTEB ArxivClusteringS2S
|
|
|
140 |
config: default
|
141 |
split: test
|
142 |
revision: f910caf1a6075f7329cdf8c1a6135696f37dbd53
|
@@ -146,8 +146,8 @@ model-index:
|
|
146 |
- task:
|
147 |
type: Reranking
|
148 |
dataset:
|
149 |
-
type: mteb/askubuntudupquestions-reranking
|
150 |
name: MTEB AskUbuntuDupQuestions
|
|
|
151 |
config: default
|
152 |
split: test
|
153 |
revision: 2000358ca161889fa9c082cb41daa8dcfb161a54
|
@@ -159,8 +159,8 @@ model-index:
|
|
159 |
- task:
|
160 |
type: STS
|
161 |
dataset:
|
162 |
-
type: mteb/biosses-sts
|
163 |
name: MTEB BIOSSES
|
|
|
164 |
config: default
|
165 |
split: test
|
166 |
revision: d3fb88f8f02e40887cd149695127462bbcf29b4a
|
@@ -180,8 +180,8 @@ model-index:
|
|
180 |
- task:
|
181 |
type: Classification
|
182 |
dataset:
|
183 |
-
type: mteb/banking77
|
184 |
name: MTEB Banking77Classification
|
|
|
185 |
config: default
|
186 |
split: test
|
187 |
revision: 0fd18e25b25c072e09e0d92ab615fda904d66300
|
@@ -193,8 +193,8 @@ model-index:
|
|
193 |
- task:
|
194 |
type: Clustering
|
195 |
dataset:
|
196 |
-
type: mteb/biorxiv-clustering-p2p
|
197 |
name: MTEB BiorxivClusteringP2P
|
|
|
198 |
config: default
|
199 |
split: test
|
200 |
revision: 65b79d1d13f80053f67aca9498d9402c2d9f1f40
|
@@ -204,8 +204,8 @@ model-index:
|
|
204 |
- task:
|
205 |
type: Clustering
|
206 |
dataset:
|
207 |
-
type: mteb/biorxiv-clustering-s2s
|
208 |
name: MTEB BiorxivClusteringS2S
|
|
|
209 |
config: default
|
210 |
split: test
|
211 |
revision: 258694dd0231531bc1fd9de6ceb52a0853c6d908
|
@@ -215,8 +215,8 @@ model-index:
|
|
215 |
- task:
|
216 |
type: Retrieval
|
217 |
dataset:
|
218 |
-
type: BeIR/cqadupstack
|
219 |
name: MTEB CQADupstackAndroidRetrieval
|
|
|
220 |
config: default
|
221 |
split: test
|
222 |
revision: None
|
@@ -281,15 +281,6 @@ model-index:
|
|
281 |
value: 56.58
|
282 |
- type: recall_at_5
|
283 |
value: 63.125
|
284 |
-
- task:
|
285 |
-
type: Retrieval
|
286 |
-
dataset:
|
287 |
-
type: BeIR/cqadupstack
|
288 |
-
name: MTEB CQADupstackEnglishRetrieval
|
289 |
-
config: default
|
290 |
-
split: test
|
291 |
-
revision: None
|
292 |
-
metrics:
|
293 |
- type: map_at_1
|
294 |
value: 38.025999999999996
|
295 |
- type: map_at_10
|
@@ -350,15 +341,6 @@ model-index:
|
|
350 |
value: 54.493
|
351 |
- type: recall_at_5
|
352 |
value: 59.64699999999999
|
353 |
-
- task:
|
354 |
-
type: Retrieval
|
355 |
-
dataset:
|
356 |
-
type: BeIR/cqadupstack
|
357 |
-
name: MTEB CQADupstackGamingRetrieval
|
358 |
-
config: default
|
359 |
-
split: test
|
360 |
-
revision: None
|
361 |
-
metrics:
|
362 |
- type: map_at_1
|
363 |
value: 47.905
|
364 |
- type: map_at_10
|
@@ -419,15 +401,6 @@ model-index:
|
|
419 |
value: 67.05600000000001
|
420 |
- type: recall_at_5
|
421 |
value: 74.261
|
422 |
-
- task:
|
423 |
-
type: Retrieval
|
424 |
-
dataset:
|
425 |
-
type: BeIR/cqadupstack
|
426 |
-
name: MTEB CQADupstackGisRetrieval
|
427 |
-
config: default
|
428 |
-
split: test
|
429 |
-
revision: None
|
430 |
-
metrics:
|
431 |
- type: map_at_1
|
432 |
value: 30.745
|
433 |
- type: map_at_10
|
@@ -488,15 +461,6 @@ model-index:
|
|
488 |
value: 45.378
|
489 |
- type: recall_at_5
|
490 |
value: 53.580000000000005
|
491 |
-
- task:
|
492 |
-
type: Retrieval
|
493 |
-
dataset:
|
494 |
-
type: BeIR/cqadupstack
|
495 |
-
name: MTEB CQADupstackMathematicaRetrieval
|
496 |
-
config: default
|
497 |
-
split: test
|
498 |
-
revision: None
|
499 |
-
metrics:
|
500 |
- type: map_at_1
|
501 |
value: 19.637999999999998
|
502 |
- type: map_at_10
|
@@ -557,15 +521,6 @@ model-index:
|
|
557 |
value: 36.384
|
558 |
- type: recall_at_5
|
559 |
value: 43.964
|
560 |
-
- task:
|
561 |
-
type: Retrieval
|
562 |
-
dataset:
|
563 |
-
type: BeIR/cqadupstack
|
564 |
-
name: MTEB CQADupstackPhysicsRetrieval
|
565 |
-
config: default
|
566 |
-
split: test
|
567 |
-
revision: None
|
568 |
-
metrics:
|
569 |
- type: map_at_1
|
570 |
value: 34.884
|
571 |
- type: map_at_10
|
@@ -626,15 +581,6 @@ model-index:
|
|
626 |
value: 52.428
|
627 |
- type: recall_at_5
|
628 |
value: 60.662000000000006
|
629 |
-
- task:
|
630 |
-
type: Retrieval
|
631 |
-
dataset:
|
632 |
-
type: BeIR/cqadupstack
|
633 |
-
name: MTEB CQADupstackProgrammersRetrieval
|
634 |
-
config: default
|
635 |
-
split: test
|
636 |
-
revision: None
|
637 |
-
metrics:
|
638 |
- type: map_at_1
|
639 |
value: 31.588
|
640 |
- type: map_at_10
|
@@ -695,15 +641,6 @@ model-index:
|
|
695 |
value: 47.128
|
696 |
- type: recall_at_5
|
697 |
value: 54.954
|
698 |
-
- task:
|
699 |
-
type: Retrieval
|
700 |
-
dataset:
|
701 |
-
type: BeIR/cqadupstack
|
702 |
-
name: MTEB CQADupstackRetrieval
|
703 |
-
config: default
|
704 |
-
split: test
|
705 |
-
revision: None
|
706 |
-
metrics:
|
707 |
- type: map_at_1
|
708 |
value: 31.956083333333336
|
709 |
- type: map_at_10
|
@@ -764,15 +701,6 @@ model-index:
|
|
764 |
value: 47.52016666666666
|
765 |
- type: recall_at_5
|
766 |
value: 54.36066666666666
|
767 |
-
- task:
|
768 |
-
type: Retrieval
|
769 |
-
dataset:
|
770 |
-
type: BeIR/cqadupstack
|
771 |
-
name: MTEB CQADupstackStatsRetrieval
|
772 |
-
config: default
|
773 |
-
split: test
|
774 |
-
revision: None
|
775 |
-
metrics:
|
776 |
- type: map_at_1
|
777 |
value: 28.912
|
778 |
- type: map_at_10
|
@@ -833,15 +761,6 @@ model-index:
|
|
833 |
value: 42.569
|
834 |
- type: recall_at_5
|
835 |
value: 48.719
|
836 |
-
- task:
|
837 |
-
type: Retrieval
|
838 |
-
dataset:
|
839 |
-
type: BeIR/cqadupstack
|
840 |
-
name: MTEB CQADupstackTexRetrieval
|
841 |
-
config: default
|
842 |
-
split: test
|
843 |
-
revision: None
|
844 |
-
metrics:
|
845 |
- type: map_at_1
|
846 |
value: 22.181
|
847 |
- type: map_at_10
|
@@ -902,15 +821,6 @@ model-index:
|
|
902 |
value: 35.003
|
903 |
- type: recall_at_5
|
904 |
value: 40.876000000000005
|
905 |
-
- task:
|
906 |
-
type: Retrieval
|
907 |
-
dataset:
|
908 |
-
type: BeIR/cqadupstack
|
909 |
-
name: MTEB CQADupstackUnixRetrieval
|
910 |
-
config: default
|
911 |
-
split: test
|
912 |
-
revision: None
|
913 |
-
metrics:
|
914 |
- type: map_at_1
|
915 |
value: 33.934999999999995
|
916 |
- type: map_at_10
|
@@ -971,15 +881,6 @@ model-index:
|
|
971 |
value: 47.439
|
972 |
- type: recall_at_5
|
973 |
value: 54.567
|
974 |
-
- task:
|
975 |
-
type: Retrieval
|
976 |
-
dataset:
|
977 |
-
type: BeIR/cqadupstack
|
978 |
-
name: MTEB CQADupstackWebmastersRetrieval
|
979 |
-
config: default
|
980 |
-
split: test
|
981 |
-
revision: None
|
982 |
-
metrics:
|
983 |
- type: map_at_1
|
984 |
value: 32.058
|
985 |
- type: map_at_10
|
@@ -1040,15 +941,6 @@ model-index:
|
|
1040 |
value: 47.509
|
1041 |
- type: recall_at_5
|
1042 |
value: 52.455
|
1043 |
-
- task:
|
1044 |
-
type: Retrieval
|
1045 |
-
dataset:
|
1046 |
-
type: BeIR/cqadupstack
|
1047 |
-
name: MTEB CQADupstackWordpressRetrieval
|
1048 |
-
config: default
|
1049 |
-
split: test
|
1050 |
-
revision: None
|
1051 |
-
metrics:
|
1052 |
- type: map_at_1
|
1053 |
value: 26.029000000000003
|
1054 |
- type: map_at_10
|
@@ -1112,8 +1004,8 @@ model-index:
|
|
1112 |
- task:
|
1113 |
type: Retrieval
|
1114 |
dataset:
|
1115 |
-
type: climate-fever
|
1116 |
name: MTEB ClimateFEVER
|
|
|
1117 |
config: default
|
1118 |
split: test
|
1119 |
revision: None
|
@@ -1181,8 +1073,8 @@ model-index:
|
|
1181 |
- task:
|
1182 |
type: Retrieval
|
1183 |
dataset:
|
1184 |
-
type: dbpedia-entity
|
1185 |
name: MTEB DBPedia
|
|
|
1186 |
config: default
|
1187 |
split: test
|
1188 |
revision: None
|
@@ -1250,8 +1142,8 @@ model-index:
|
|
1250 |
- task:
|
1251 |
type: Classification
|
1252 |
dataset:
|
1253 |
-
type: mteb/emotion
|
1254 |
name: MTEB EmotionClassification
|
|
|
1255 |
config: default
|
1256 |
split: test
|
1257 |
revision: 4f58c6b202a23cf9a4da393831edf4f9183cad37
|
@@ -1263,8 +1155,8 @@ model-index:
|
|
1263 |
- task:
|
1264 |
type: Retrieval
|
1265 |
dataset:
|
1266 |
-
type: fever
|
1267 |
name: MTEB FEVER
|
|
|
1268 |
config: default
|
1269 |
split: test
|
1270 |
revision: None
|
@@ -1332,8 +1224,8 @@ model-index:
|
|
1332 |
- task:
|
1333 |
type: Retrieval
|
1334 |
dataset:
|
1335 |
-
type: fiqa
|
1336 |
name: MTEB FiQA2018
|
|
|
1337 |
config: default
|
1338 |
split: test
|
1339 |
revision: None
|
@@ -1401,8 +1293,8 @@ model-index:
|
|
1401 |
- task:
|
1402 |
type: Retrieval
|
1403 |
dataset:
|
1404 |
-
type: hotpotqa
|
1405 |
name: MTEB HotpotQA
|
|
|
1406 |
config: default
|
1407 |
split: test
|
1408 |
revision: None
|
@@ -1470,8 +1362,8 @@ model-index:
|
|
1470 |
- task:
|
1471 |
type: Classification
|
1472 |
dataset:
|
1473 |
-
type: mteb/imdb
|
1474 |
name: MTEB ImdbClassification
|
|
|
1475 |
config: default
|
1476 |
split: test
|
1477 |
revision: 3d86128a09e091d6018b6d26cad27f2739fc2db7
|
@@ -1485,8 +1377,8 @@ model-index:
|
|
1485 |
- task:
|
1486 |
type: Retrieval
|
1487 |
dataset:
|
1488 |
-
type: msmarco
|
1489 |
name: MTEB MSMARCO
|
|
|
1490 |
config: default
|
1491 |
split: dev
|
1492 |
revision: None
|
@@ -1554,8 +1446,8 @@ model-index:
|
|
1554 |
- task:
|
1555 |
type: Classification
|
1556 |
dataset:
|
1557 |
-
type: mteb/mtop_domain
|
1558 |
name: MTEB MTOPDomainClassification (en)
|
|
|
1559 |
config: en
|
1560 |
split: test
|
1561 |
revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf
|
@@ -1567,8 +1459,8 @@ model-index:
|
|
1567 |
- task:
|
1568 |
type: Classification
|
1569 |
dataset:
|
1570 |
-
type: mteb/mtop_intent
|
1571 |
name: MTEB MTOPIntentClassification (en)
|
|
|
1572 |
config: en
|
1573 |
split: test
|
1574 |
revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba
|
@@ -1580,8 +1472,8 @@ model-index:
|
|
1580 |
- task:
|
1581 |
type: Classification
|
1582 |
dataset:
|
1583 |
-
type: mteb/amazon_massive_intent
|
1584 |
name: MTEB MassiveIntentClassification (en)
|
|
|
1585 |
config: en
|
1586 |
split: test
|
1587 |
revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
|
@@ -1593,8 +1485,8 @@ model-index:
|
|
1593 |
- task:
|
1594 |
type: Classification
|
1595 |
dataset:
|
1596 |
-
type: mteb/amazon_massive_scenario
|
1597 |
name: MTEB MassiveScenarioClassification (en)
|
|
|
1598 |
config: en
|
1599 |
split: test
|
1600 |
revision: 7d571f92784cd94a019292a1f45445077d0ef634
|
@@ -1606,8 +1498,8 @@ model-index:
|
|
1606 |
- task:
|
1607 |
type: Clustering
|
1608 |
dataset:
|
1609 |
-
type: mteb/medrxiv-clustering-p2p
|
1610 |
name: MTEB MedrxivClusteringP2P
|
|
|
1611 |
config: default
|
1612 |
split: test
|
1613 |
revision: e7a26af6f3ae46b30dde8737f02c07b1505bcc73
|
@@ -1617,8 +1509,8 @@ model-index:
|
|
1617 |
- task:
|
1618 |
type: Clustering
|
1619 |
dataset:
|
1620 |
-
type: mteb/medrxiv-clustering-s2s
|
1621 |
name: MTEB MedrxivClusteringS2S
|
|
|
1622 |
config: default
|
1623 |
split: test
|
1624 |
revision: 35191c8c0dca72d8ff3efcd72aa802307d469663
|
@@ -1628,8 +1520,8 @@ model-index:
|
|
1628 |
- task:
|
1629 |
type: Reranking
|
1630 |
dataset:
|
1631 |
-
type: mteb/mind_small
|
1632 |
name: MTEB MindSmallReranking
|
|
|
1633 |
config: default
|
1634 |
split: test
|
1635 |
revision: 3bdac13927fdc888b903db93b2ffdbd90b295a69
|
@@ -1641,8 +1533,8 @@ model-index:
|
|
1641 |
- task:
|
1642 |
type: Retrieval
|
1643 |
dataset:
|
1644 |
-
type: nfcorpus
|
1645 |
name: MTEB NFCorpus
|
|
|
1646 |
config: default
|
1647 |
split: test
|
1648 |
revision: None
|
@@ -1710,8 +1602,8 @@ model-index:
|
|
1710 |
- task:
|
1711 |
type: Retrieval
|
1712 |
dataset:
|
1713 |
-
type: nq
|
1714 |
name: MTEB NQ
|
|
|
1715 |
config: default
|
1716 |
split: test
|
1717 |
revision: None
|
@@ -1779,8 +1671,8 @@ model-index:
|
|
1779 |
- task:
|
1780 |
type: Retrieval
|
1781 |
dataset:
|
1782 |
-
type: quora
|
1783 |
name: MTEB QuoraRetrieval
|
|
|
1784 |
config: default
|
1785 |
split: test
|
1786 |
revision: None
|
@@ -1848,8 +1740,8 @@ model-index:
|
|
1848 |
- task:
|
1849 |
type: Clustering
|
1850 |
dataset:
|
1851 |
-
type: mteb/reddit-clustering
|
1852 |
name: MTEB RedditClustering
|
|
|
1853 |
config: default
|
1854 |
split: test
|
1855 |
revision: 24640382cdbf8abc73003fb0fa6d111a705499eb
|
@@ -1859,8 +1751,8 @@ model-index:
|
|
1859 |
- task:
|
1860 |
type: Clustering
|
1861 |
dataset:
|
1862 |
-
type: mteb/reddit-clustering-p2p
|
1863 |
name: MTEB RedditClusteringP2P
|
|
|
1864 |
config: default
|
1865 |
split: test
|
1866 |
revision: 282350215ef01743dc01b456c7f5241fa8937f16
|
@@ -1870,8 +1762,8 @@ model-index:
|
|
1870 |
- task:
|
1871 |
type: Retrieval
|
1872 |
dataset:
|
1873 |
-
type: scidocs
|
1874 |
name: MTEB SCIDOCS
|
|
|
1875 |
config: default
|
1876 |
split: test
|
1877 |
revision: None
|
@@ -1939,8 +1831,8 @@ model-index:
|
|
1939 |
- task:
|
1940 |
type: STS
|
1941 |
dataset:
|
1942 |
-
type: mteb/sickr-sts
|
1943 |
name: MTEB SICK-R
|
|
|
1944 |
config: default
|
1945 |
split: test
|
1946 |
revision: a6ea5a8cab320b040a23452cc28066d9beae2cee
|
@@ -1960,8 +1852,8 @@ model-index:
|
|
1960 |
- task:
|
1961 |
type: STS
|
1962 |
dataset:
|
1963 |
-
type: mteb/sts12-sts
|
1964 |
name: MTEB STS12
|
|
|
1965 |
config: default
|
1966 |
split: test
|
1967 |
revision: a0d554a64d88156834ff5ae9920b964011b16384
|
@@ -1981,8 +1873,8 @@ model-index:
|
|
1981 |
- task:
|
1982 |
type: STS
|
1983 |
dataset:
|
1984 |
-
type: mteb/sts13-sts
|
1985 |
name: MTEB STS13
|
|
|
1986 |
config: default
|
1987 |
split: test
|
1988 |
revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca
|
@@ -2002,8 +1894,8 @@ model-index:
|
|
2002 |
- task:
|
2003 |
type: STS
|
2004 |
dataset:
|
2005 |
-
type: mteb/sts14-sts
|
2006 |
name: MTEB STS14
|
|
|
2007 |
config: default
|
2008 |
split: test
|
2009 |
revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375
|
@@ -2023,8 +1915,8 @@ model-index:
|
|
2023 |
- task:
|
2024 |
type: STS
|
2025 |
dataset:
|
2026 |
-
type: mteb/sts15-sts
|
2027 |
name: MTEB STS15
|
|
|
2028 |
config: default
|
2029 |
split: test
|
2030 |
revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3
|
@@ -2044,8 +1936,8 @@ model-index:
|
|
2044 |
- task:
|
2045 |
type: STS
|
2046 |
dataset:
|
2047 |
-
type: mteb/sts16-sts
|
2048 |
name: MTEB STS16
|
|
|
2049 |
config: default
|
2050 |
split: test
|
2051 |
revision: 4d8694f8f0e0100860b497b999b3dbed754a0513
|
@@ -2065,8 +1957,8 @@ model-index:
|
|
2065 |
- task:
|
2066 |
type: STS
|
2067 |
dataset:
|
2068 |
-
type: mteb/sts17-crosslingual-sts
|
2069 |
name: MTEB STS17 (en-en)
|
|
|
2070 |
config: en-en
|
2071 |
split: test
|
2072 |
revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d
|
@@ -2086,8 +1978,8 @@ model-index:
|
|
2086 |
- task:
|
2087 |
type: STS
|
2088 |
dataset:
|
2089 |
-
type: mteb/sts22-crosslingual-sts
|
2090 |
name: MTEB STS22 (en)
|
|
|
2091 |
config: en
|
2092 |
split: test
|
2093 |
revision: eea2b4fe26a775864c896887d910b76a8098ad3f
|
@@ -2107,8 +1999,8 @@ model-index:
|
|
2107 |
- task:
|
2108 |
type: STS
|
2109 |
dataset:
|
2110 |
-
type: mteb/stsbenchmark-sts
|
2111 |
name: MTEB STSBenchmark
|
|
|
2112 |
config: default
|
2113 |
split: test
|
2114 |
revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831
|
@@ -2128,8 +2020,8 @@ model-index:
|
|
2128 |
- task:
|
2129 |
type: Reranking
|
2130 |
dataset:
|
2131 |
-
type: mteb/scidocs-reranking
|
2132 |
name: MTEB SciDocsRR
|
|
|
2133 |
config: default
|
2134 |
split: test
|
2135 |
revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab
|
@@ -2141,8 +2033,8 @@ model-index:
|
|
2141 |
- task:
|
2142 |
type: Retrieval
|
2143 |
dataset:
|
2144 |
-
type: scifact
|
2145 |
name: MTEB SciFact
|
|
|
2146 |
config: default
|
2147 |
split: test
|
2148 |
revision: None
|
@@ -2210,8 +2102,8 @@ model-index:
|
|
2210 |
- task:
|
2211 |
type: PairClassification
|
2212 |
dataset:
|
2213 |
-
type: mteb/sprintduplicatequestions-pairclassification
|
2214 |
name: MTEB SprintDuplicateQuestions
|
|
|
2215 |
config: default
|
2216 |
split: test
|
2217 |
revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46
|
@@ -2265,8 +2157,8 @@ model-index:
|
|
2265 |
- task:
|
2266 |
type: Clustering
|
2267 |
dataset:
|
2268 |
-
type: mteb/stackexchange-clustering
|
2269 |
name: MTEB StackExchangeClustering
|
|
|
2270 |
config: default
|
2271 |
split: test
|
2272 |
revision: 6cbc1f7b2bc0622f2e39d2c77fa502909748c259
|
@@ -2276,8 +2168,8 @@ model-index:
|
|
2276 |
- task:
|
2277 |
type: Clustering
|
2278 |
dataset:
|
2279 |
-
type: mteb/stackexchange-clustering-p2p
|
2280 |
name: MTEB StackExchangeClusteringP2P
|
|
|
2281 |
config: default
|
2282 |
split: test
|
2283 |
revision: 815ca46b2622cec33ccafc3735d572c266efdb44
|
@@ -2287,8 +2179,8 @@ model-index:
|
|
2287 |
- task:
|
2288 |
type: Reranking
|
2289 |
dataset:
|
2290 |
-
type: mteb/stackoverflowdupquestions-reranking
|
2291 |
name: MTEB StackOverflowDupQuestions
|
|
|
2292 |
config: default
|
2293 |
split: test
|
2294 |
revision: e185fbe320c72810689fc5848eb6114e1ef5ec69
|
@@ -2300,8 +2192,8 @@ model-index:
|
|
2300 |
- task:
|
2301 |
type: Summarization
|
2302 |
dataset:
|
2303 |
-
type: mteb/summeval
|
2304 |
name: MTEB SummEval
|
|
|
2305 |
config: default
|
2306 |
split: test
|
2307 |
revision: cda12ad7615edc362dbf25a00fdd61d3b1eaf93c
|
@@ -2317,8 +2209,8 @@ model-index:
|
|
2317 |
- task:
|
2318 |
type: Retrieval
|
2319 |
dataset:
|
2320 |
-
type: trec-covid
|
2321 |
name: MTEB TRECCOVID
|
|
|
2322 |
config: default
|
2323 |
split: test
|
2324 |
revision: None
|
@@ -2386,8 +2278,8 @@ model-index:
|
|
2386 |
- task:
|
2387 |
type: Retrieval
|
2388 |
dataset:
|
2389 |
-
type: webis-touche2020
|
2390 |
name: MTEB Touche2020
|
|
|
2391 |
config: default
|
2392 |
split: test
|
2393 |
revision: None
|
@@ -2455,8 +2347,8 @@ model-index:
|
|
2455 |
- task:
|
2456 |
type: Classification
|
2457 |
dataset:
|
2458 |
-
type: mteb/toxic_conversations_50k
|
2459 |
name: MTEB ToxicConversationsClassification
|
|
|
2460 |
config: default
|
2461 |
split: test
|
2462 |
revision: d7c0de2777da35d6aae2200a62c6e0e5af397c4c
|
@@ -2470,8 +2362,8 @@ model-index:
|
|
2470 |
- task:
|
2471 |
type: Classification
|
2472 |
dataset:
|
2473 |
-
type: mteb/tweet_sentiment_extraction
|
2474 |
name: MTEB TweetSentimentExtractionClassification
|
|
|
2475 |
config: default
|
2476 |
split: test
|
2477 |
revision: d604517c81ca91fe16a244d1248fc021f9ecee7a
|
@@ -2483,8 +2375,8 @@ model-index:
|
|
2483 |
- task:
|
2484 |
type: Clustering
|
2485 |
dataset:
|
2486 |
-
type: mteb/twentynewsgroups-clustering
|
2487 |
name: MTEB TwentyNewsgroupsClustering
|
|
|
2488 |
config: default
|
2489 |
split: test
|
2490 |
revision: 6125ec4e24fa026cec8a478383ee943acfbd5449
|
@@ -2494,8 +2386,8 @@ model-index:
|
|
2494 |
- task:
|
2495 |
type: PairClassification
|
2496 |
dataset:
|
2497 |
-
type: mteb/twittersemeval2015-pairclassification
|
2498 |
name: MTEB TwitterSemEval2015
|
|
|
2499 |
config: default
|
2500 |
split: test
|
2501 |
revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1
|
@@ -2549,8 +2441,8 @@ model-index:
|
|
2549 |
- task:
|
2550 |
type: PairClassification
|
2551 |
dataset:
|
2552 |
-
type: mteb/twitterurlcorpus-pairclassification
|
2553 |
name: MTEB TwitterURLCorpus
|
|
|
2554 |
config: default
|
2555 |
split: test
|
2556 |
revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf
|
@@ -2601,6 +2493,106 @@ model-index:
|
|
2601 |
value: 87.47814292587448
|
2602 |
- type: max_f1
|
2603 |
value: 80.15461150280949
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2604 |
---
|
2605 |
|
2606 |
# Model Summary
|
@@ -2632,4 +2624,17 @@ The model usage is documented [here](https://github.com/ContextualAI/gritlm?tab=
|
|
2632 |
archivePrefix={arXiv},
|
2633 |
primaryClass={cs.CL}
|
2634 |
}
|
2635 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
|
|
|
|
2 |
license: apache-2.0
|
|
|
|
|
3 |
tags:
|
4 |
- mteb
|
5 |
+
datasets:
|
6 |
+
- GritLM/tulu2
|
7 |
+
pipeline_tag: text-generation
|
8 |
+
inference: true
|
9 |
model-index:
|
10 |
- name: GritLM-7B
|
11 |
results:
|
12 |
- task:
|
13 |
type: Classification
|
14 |
dataset:
|
|
|
15 |
name: MTEB AmazonCounterfactualClassification (en)
|
16 |
+
type: mteb/amazon_counterfactual
|
17 |
config: en
|
18 |
split: test
|
19 |
revision: e8379541af4e31359cca9fbcf4b00f2671dba205
|
|
|
27 |
- task:
|
28 |
type: Classification
|
29 |
dataset:
|
|
|
30 |
name: MTEB AmazonPolarityClassification
|
31 |
+
type: mteb/amazon_polarity
|
32 |
config: default
|
33 |
split: test
|
34 |
revision: e2d317d38cd51312af73b3d32a06d1a08b442046
|
|
|
42 |
- task:
|
43 |
type: Classification
|
44 |
dataset:
|
|
|
45 |
name: MTEB AmazonReviewsClassification (en)
|
46 |
+
type: mteb/amazon_reviews_multi
|
47 |
config: en
|
48 |
split: test
|
49 |
revision: 1399c76144fd37290681b995c656ef9b2e06e26d
|
|
|
55 |
- task:
|
56 |
type: Retrieval
|
57 |
dataset:
|
|
|
58 |
name: MTEB ArguAna
|
59 |
+
type: arguana
|
60 |
config: default
|
61 |
split: test
|
62 |
revision: None
|
|
|
124 |
- task:
|
125 |
type: Clustering
|
126 |
dataset:
|
|
|
127 |
name: MTEB ArxivClusteringP2P
|
128 |
+
type: mteb/arxiv-clustering-p2p
|
129 |
config: default
|
130 |
split: test
|
131 |
revision: a122ad7f3f0291bf49cc6f4d32aa80929df69d5d
|
|
|
135 |
- task:
|
136 |
type: Clustering
|
137 |
dataset:
|
|
|
138 |
name: MTEB ArxivClusteringS2S
|
139 |
+
type: mteb/arxiv-clustering-s2s
|
140 |
config: default
|
141 |
split: test
|
142 |
revision: f910caf1a6075f7329cdf8c1a6135696f37dbd53
|
|
|
146 |
- task:
|
147 |
type: Reranking
|
148 |
dataset:
|
|
|
149 |
name: MTEB AskUbuntuDupQuestions
|
150 |
+
type: mteb/askubuntudupquestions-reranking
|
151 |
config: default
|
152 |
split: test
|
153 |
revision: 2000358ca161889fa9c082cb41daa8dcfb161a54
|
|
|
159 |
- task:
|
160 |
type: STS
|
161 |
dataset:
|
|
|
162 |
name: MTEB BIOSSES
|
163 |
+
type: mteb/biosses-sts
|
164 |
config: default
|
165 |
split: test
|
166 |
revision: d3fb88f8f02e40887cd149695127462bbcf29b4a
|
|
|
180 |
- task:
|
181 |
type: Classification
|
182 |
dataset:
|
|
|
183 |
name: MTEB Banking77Classification
|
184 |
+
type: mteb/banking77
|
185 |
config: default
|
186 |
split: test
|
187 |
revision: 0fd18e25b25c072e09e0d92ab615fda904d66300
|
|
|
193 |
- task:
|
194 |
type: Clustering
|
195 |
dataset:
|
|
|
196 |
name: MTEB BiorxivClusteringP2P
|
197 |
+
type: mteb/biorxiv-clustering-p2p
|
198 |
config: default
|
199 |
split: test
|
200 |
revision: 65b79d1d13f80053f67aca9498d9402c2d9f1f40
|
|
|
204 |
- task:
|
205 |
type: Clustering
|
206 |
dataset:
|
|
|
207 |
name: MTEB BiorxivClusteringS2S
|
208 |
+
type: mteb/biorxiv-clustering-s2s
|
209 |
config: default
|
210 |
split: test
|
211 |
revision: 258694dd0231531bc1fd9de6ceb52a0853c6d908
|
|
|
215 |
- task:
|
216 |
type: Retrieval
|
217 |
dataset:
|
|
|
218 |
name: MTEB CQADupstackAndroidRetrieval
|
219 |
+
type: BeIR/cqadupstack
|
220 |
config: default
|
221 |
split: test
|
222 |
revision: None
|
|
|
281 |
value: 56.58
|
282 |
- type: recall_at_5
|
283 |
value: 63.125
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
284 |
- type: map_at_1
|
285 |
value: 38.025999999999996
|
286 |
- type: map_at_10
|
|
|
341 |
value: 54.493
|
342 |
- type: recall_at_5
|
343 |
value: 59.64699999999999
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
344 |
- type: map_at_1
|
345 |
value: 47.905
|
346 |
- type: map_at_10
|
|
|
401 |
value: 67.05600000000001
|
402 |
- type: recall_at_5
|
403 |
value: 74.261
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
404 |
- type: map_at_1
|
405 |
value: 30.745
|
406 |
- type: map_at_10
|
|
|
461 |
value: 45.378
|
462 |
- type: recall_at_5
|
463 |
value: 53.580000000000005
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
464 |
- type: map_at_1
|
465 |
value: 19.637999999999998
|
466 |
- type: map_at_10
|
|
|
521 |
value: 36.384
|
522 |
- type: recall_at_5
|
523 |
value: 43.964
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
524 |
- type: map_at_1
|
525 |
value: 34.884
|
526 |
- type: map_at_10
|
|
|
581 |
value: 52.428
|
582 |
- type: recall_at_5
|
583 |
value: 60.662000000000006
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
584 |
- type: map_at_1
|
585 |
value: 31.588
|
586 |
- type: map_at_10
|
|
|
641 |
value: 47.128
|
642 |
- type: recall_at_5
|
643 |
value: 54.954
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
644 |
- type: map_at_1
|
645 |
value: 31.956083333333336
|
646 |
- type: map_at_10
|
|
|
701 |
value: 47.52016666666666
|
702 |
- type: recall_at_5
|
703 |
value: 54.36066666666666
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
704 |
- type: map_at_1
|
705 |
value: 28.912
|
706 |
- type: map_at_10
|
|
|
761 |
value: 42.569
|
762 |
- type: recall_at_5
|
763 |
value: 48.719
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
764 |
- type: map_at_1
|
765 |
value: 22.181
|
766 |
- type: map_at_10
|
|
|
821 |
value: 35.003
|
822 |
- type: recall_at_5
|
823 |
value: 40.876000000000005
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
824 |
- type: map_at_1
|
825 |
value: 33.934999999999995
|
826 |
- type: map_at_10
|
|
|
881 |
value: 47.439
|
882 |
- type: recall_at_5
|
883 |
value: 54.567
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
884 |
- type: map_at_1
|
885 |
value: 32.058
|
886 |
- type: map_at_10
|
|
|
941 |
value: 47.509
|
942 |
- type: recall_at_5
|
943 |
value: 52.455
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
944 |
- type: map_at_1
|
945 |
value: 26.029000000000003
|
946 |
- type: map_at_10
|
|
|
1004 |
- task:
|
1005 |
type: Retrieval
|
1006 |
dataset:
|
|
|
1007 |
name: MTEB ClimateFEVER
|
1008 |
+
type: climate-fever
|
1009 |
config: default
|
1010 |
split: test
|
1011 |
revision: None
|
|
|
1073 |
- task:
|
1074 |
type: Retrieval
|
1075 |
dataset:
|
|
|
1076 |
name: MTEB DBPedia
|
1077 |
+
type: dbpedia-entity
|
1078 |
config: default
|
1079 |
split: test
|
1080 |
revision: None
|
|
|
1142 |
- task:
|
1143 |
type: Classification
|
1144 |
dataset:
|
|
|
1145 |
name: MTEB EmotionClassification
|
1146 |
+
type: mteb/emotion
|
1147 |
config: default
|
1148 |
split: test
|
1149 |
revision: 4f58c6b202a23cf9a4da393831edf4f9183cad37
|
|
|
1155 |
- task:
|
1156 |
type: Retrieval
|
1157 |
dataset:
|
|
|
1158 |
name: MTEB FEVER
|
1159 |
+
type: fever
|
1160 |
config: default
|
1161 |
split: test
|
1162 |
revision: None
|
|
|
1224 |
- task:
|
1225 |
type: Retrieval
|
1226 |
dataset:
|
|
|
1227 |
name: MTEB FiQA2018
|
1228 |
+
type: fiqa
|
1229 |
config: default
|
1230 |
split: test
|
1231 |
revision: None
|
|
|
1293 |
- task:
|
1294 |
type: Retrieval
|
1295 |
dataset:
|
|
|
1296 |
name: MTEB HotpotQA
|
1297 |
+
type: hotpotqa
|
1298 |
config: default
|
1299 |
split: test
|
1300 |
revision: None
|
|
|
1362 |
- task:
|
1363 |
type: Classification
|
1364 |
dataset:
|
|
|
1365 |
name: MTEB ImdbClassification
|
1366 |
+
type: mteb/imdb
|
1367 |
config: default
|
1368 |
split: test
|
1369 |
revision: 3d86128a09e091d6018b6d26cad27f2739fc2db7
|
|
|
1377 |
- task:
|
1378 |
type: Retrieval
|
1379 |
dataset:
|
|
|
1380 |
name: MTEB MSMARCO
|
1381 |
+
type: msmarco
|
1382 |
config: default
|
1383 |
split: dev
|
1384 |
revision: None
|
|
|
1446 |
- task:
|
1447 |
type: Classification
|
1448 |
dataset:
|
|
|
1449 |
name: MTEB MTOPDomainClassification (en)
|
1450 |
+
type: mteb/mtop_domain
|
1451 |
config: en
|
1452 |
split: test
|
1453 |
revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf
|
|
|
1459 |
- task:
|
1460 |
type: Classification
|
1461 |
dataset:
|
|
|
1462 |
name: MTEB MTOPIntentClassification (en)
|
1463 |
+
type: mteb/mtop_intent
|
1464 |
config: en
|
1465 |
split: test
|
1466 |
revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba
|
|
|
1472 |
- task:
|
1473 |
type: Classification
|
1474 |
dataset:
|
|
|
1475 |
name: MTEB MassiveIntentClassification (en)
|
1476 |
+
type: mteb/amazon_massive_intent
|
1477 |
config: en
|
1478 |
split: test
|
1479 |
revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
|
|
|
1485 |
- task:
|
1486 |
type: Classification
|
1487 |
dataset:
|
|
|
1488 |
name: MTEB MassiveScenarioClassification (en)
|
1489 |
+
type: mteb/amazon_massive_scenario
|
1490 |
config: en
|
1491 |
split: test
|
1492 |
revision: 7d571f92784cd94a019292a1f45445077d0ef634
|
|
|
1498 |
- task:
|
1499 |
type: Clustering
|
1500 |
dataset:
|
|
|
1501 |
name: MTEB MedrxivClusteringP2P
|
1502 |
+
type: mteb/medrxiv-clustering-p2p
|
1503 |
config: default
|
1504 |
split: test
|
1505 |
revision: e7a26af6f3ae46b30dde8737f02c07b1505bcc73
|
|
|
1509 |
- task:
|
1510 |
type: Clustering
|
1511 |
dataset:
|
|
|
1512 |
name: MTEB MedrxivClusteringS2S
|
1513 |
+
type: mteb/medrxiv-clustering-s2s
|
1514 |
config: default
|
1515 |
split: test
|
1516 |
revision: 35191c8c0dca72d8ff3efcd72aa802307d469663
|
|
|
1520 |
- task:
|
1521 |
type: Reranking
|
1522 |
dataset:
|
|
|
1523 |
name: MTEB MindSmallReranking
|
1524 |
+
type: mteb/mind_small
|
1525 |
config: default
|
1526 |
split: test
|
1527 |
revision: 3bdac13927fdc888b903db93b2ffdbd90b295a69
|
|
|
1533 |
- task:
|
1534 |
type: Retrieval
|
1535 |
dataset:
|
|
|
1536 |
name: MTEB NFCorpus
|
1537 |
+
type: nfcorpus
|
1538 |
config: default
|
1539 |
split: test
|
1540 |
revision: None
|
|
|
1602 |
- task:
|
1603 |
type: Retrieval
|
1604 |
dataset:
|
|
|
1605 |
name: MTEB NQ
|
1606 |
+
type: nq
|
1607 |
config: default
|
1608 |
split: test
|
1609 |
revision: None
|
|
|
1671 |
- task:
|
1672 |
type: Retrieval
|
1673 |
dataset:
|
|
|
1674 |
name: MTEB QuoraRetrieval
|
1675 |
+
type: quora
|
1676 |
config: default
|
1677 |
split: test
|
1678 |
revision: None
|
|
|
1740 |
- task:
|
1741 |
type: Clustering
|
1742 |
dataset:
|
|
|
1743 |
name: MTEB RedditClustering
|
1744 |
+
type: mteb/reddit-clustering
|
1745 |
config: default
|
1746 |
split: test
|
1747 |
revision: 24640382cdbf8abc73003fb0fa6d111a705499eb
|
|
|
1751 |
- task:
|
1752 |
type: Clustering
|
1753 |
dataset:
|
|
|
1754 |
name: MTEB RedditClusteringP2P
|
1755 |
+
type: mteb/reddit-clustering-p2p
|
1756 |
config: default
|
1757 |
split: test
|
1758 |
revision: 282350215ef01743dc01b456c7f5241fa8937f16
|
|
|
1762 |
- task:
|
1763 |
type: Retrieval
|
1764 |
dataset:
|
|
|
1765 |
name: MTEB SCIDOCS
|
1766 |
+
type: scidocs
|
1767 |
config: default
|
1768 |
split: test
|
1769 |
revision: None
|
|
|
1831 |
- task:
|
1832 |
type: STS
|
1833 |
dataset:
|
|
|
1834 |
name: MTEB SICK-R
|
1835 |
+
type: mteb/sickr-sts
|
1836 |
config: default
|
1837 |
split: test
|
1838 |
revision: a6ea5a8cab320b040a23452cc28066d9beae2cee
|
|
|
1852 |
- task:
|
1853 |
type: STS
|
1854 |
dataset:
|
|
|
1855 |
name: MTEB STS12
|
1856 |
+
type: mteb/sts12-sts
|
1857 |
config: default
|
1858 |
split: test
|
1859 |
revision: a0d554a64d88156834ff5ae9920b964011b16384
|
|
|
1873 |
- task:
|
1874 |
type: STS
|
1875 |
dataset:
|
|
|
1876 |
name: MTEB STS13
|
1877 |
+
type: mteb/sts13-sts
|
1878 |
config: default
|
1879 |
split: test
|
1880 |
revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca
|
|
|
1894 |
- task:
|
1895 |
type: STS
|
1896 |
dataset:
|
|
|
1897 |
name: MTEB STS14
|
1898 |
+
type: mteb/sts14-sts
|
1899 |
config: default
|
1900 |
split: test
|
1901 |
revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375
|
|
|
1915 |
- task:
|
1916 |
type: STS
|
1917 |
dataset:
|
|
|
1918 |
name: MTEB STS15
|
1919 |
+
type: mteb/sts15-sts
|
1920 |
config: default
|
1921 |
split: test
|
1922 |
revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3
|
|
|
1936 |
- task:
|
1937 |
type: STS
|
1938 |
dataset:
|
|
|
1939 |
name: MTEB STS16
|
1940 |
+
type: mteb/sts16-sts
|
1941 |
config: default
|
1942 |
split: test
|
1943 |
revision: 4d8694f8f0e0100860b497b999b3dbed754a0513
|
|
|
1957 |
- task:
|
1958 |
type: STS
|
1959 |
dataset:
|
|
|
1960 |
name: MTEB STS17 (en-en)
|
1961 |
+
type: mteb/sts17-crosslingual-sts
|
1962 |
config: en-en
|
1963 |
split: test
|
1964 |
revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d
|
|
|
1978 |
- task:
|
1979 |
type: STS
|
1980 |
dataset:
|
|
|
1981 |
name: MTEB STS22 (en)
|
1982 |
+
type: mteb/sts22-crosslingual-sts
|
1983 |
config: en
|
1984 |
split: test
|
1985 |
revision: eea2b4fe26a775864c896887d910b76a8098ad3f
|
|
|
1999 |
- task:
|
2000 |
type: STS
|
2001 |
dataset:
|
|
|
2002 |
name: MTEB STSBenchmark
|
2003 |
+
type: mteb/stsbenchmark-sts
|
2004 |
config: default
|
2005 |
split: test
|
2006 |
revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831
|
|
|
2020 |
- task:
|
2021 |
type: Reranking
|
2022 |
dataset:
|
|
|
2023 |
name: MTEB SciDocsRR
|
2024 |
+
type: mteb/scidocs-reranking
|
2025 |
config: default
|
2026 |
split: test
|
2027 |
revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab
|
|
|
2033 |
- task:
|
2034 |
type: Retrieval
|
2035 |
dataset:
|
|
|
2036 |
name: MTEB SciFact
|
2037 |
+
type: scifact
|
2038 |
config: default
|
2039 |
split: test
|
2040 |
revision: None
|
|
|
2102 |
- task:
|
2103 |
type: PairClassification
|
2104 |
dataset:
|
|
|
2105 |
name: MTEB SprintDuplicateQuestions
|
2106 |
+
type: mteb/sprintduplicatequestions-pairclassification
|
2107 |
config: default
|
2108 |
split: test
|
2109 |
revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46
|
|
|
2157 |
- task:
|
2158 |
type: Clustering
|
2159 |
dataset:
|
|
|
2160 |
name: MTEB StackExchangeClustering
|
2161 |
+
type: mteb/stackexchange-clustering
|
2162 |
config: default
|
2163 |
split: test
|
2164 |
revision: 6cbc1f7b2bc0622f2e39d2c77fa502909748c259
|
|
|
2168 |
- task:
|
2169 |
type: Clustering
|
2170 |
dataset:
|
|
|
2171 |
name: MTEB StackExchangeClusteringP2P
|
2172 |
+
type: mteb/stackexchange-clustering-p2p
|
2173 |
config: default
|
2174 |
split: test
|
2175 |
revision: 815ca46b2622cec33ccafc3735d572c266efdb44
|
|
|
2179 |
- task:
|
2180 |
type: Reranking
|
2181 |
dataset:
|
|
|
2182 |
name: MTEB StackOverflowDupQuestions
|
2183 |
+
type: mteb/stackoverflowdupquestions-reranking
|
2184 |
config: default
|
2185 |
split: test
|
2186 |
revision: e185fbe320c72810689fc5848eb6114e1ef5ec69
|
|
|
2192 |
- task:
|
2193 |
type: Summarization
|
2194 |
dataset:
|
|
|
2195 |
name: MTEB SummEval
|
2196 |
+
type: mteb/summeval
|
2197 |
config: default
|
2198 |
split: test
|
2199 |
revision: cda12ad7615edc362dbf25a00fdd61d3b1eaf93c
|
|
|
2209 |
- task:
|
2210 |
type: Retrieval
|
2211 |
dataset:
|
|
|
2212 |
name: MTEB TRECCOVID
|
2213 |
+
type: trec-covid
|
2214 |
config: default
|
2215 |
split: test
|
2216 |
revision: None
|
|
|
2278 |
- task:
|
2279 |
type: Retrieval
|
2280 |
dataset:
|
|
|
2281 |
name: MTEB Touche2020
|
2282 |
+
type: webis-touche2020
|
2283 |
config: default
|
2284 |
split: test
|
2285 |
revision: None
|
|
|
2347 |
- task:
|
2348 |
type: Classification
|
2349 |
dataset:
|
|
|
2350 |
name: MTEB ToxicConversationsClassification
|
2351 |
+
type: mteb/toxic_conversations_50k
|
2352 |
config: default
|
2353 |
split: test
|
2354 |
revision: d7c0de2777da35d6aae2200a62c6e0e5af397c4c
|
|
|
2362 |
- task:
|
2363 |
type: Classification
|
2364 |
dataset:
|
|
|
2365 |
name: MTEB TweetSentimentExtractionClassification
|
2366 |
+
type: mteb/tweet_sentiment_extraction
|
2367 |
config: default
|
2368 |
split: test
|
2369 |
revision: d604517c81ca91fe16a244d1248fc021f9ecee7a
|
|
|
2375 |
- task:
|
2376 |
type: Clustering
|
2377 |
dataset:
|
|
|
2378 |
name: MTEB TwentyNewsgroupsClustering
|
2379 |
+
type: mteb/twentynewsgroups-clustering
|
2380 |
config: default
|
2381 |
split: test
|
2382 |
revision: 6125ec4e24fa026cec8a478383ee943acfbd5449
|
|
|
2386 |
- task:
|
2387 |
type: PairClassification
|
2388 |
dataset:
|
|
|
2389 |
name: MTEB TwitterSemEval2015
|
2390 |
+
type: mteb/twittersemeval2015-pairclassification
|
2391 |
config: default
|
2392 |
split: test
|
2393 |
revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1
|
|
|
2441 |
- task:
|
2442 |
type: PairClassification
|
2443 |
dataset:
|
|
|
2444 |
name: MTEB TwitterURLCorpus
|
2445 |
+
type: mteb/twitterurlcorpus-pairclassification
|
2446 |
config: default
|
2447 |
split: test
|
2448 |
revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf
|
|
|
2493 |
value: 87.47814292587448
|
2494 |
- type: max_f1
|
2495 |
value: 80.15461150280949
|
2496 |
+
- task:
|
2497 |
+
type: text-generation
|
2498 |
+
name: Text Generation
|
2499 |
+
dataset:
|
2500 |
+
name: AI2 Reasoning Challenge (25-Shot)
|
2501 |
+
type: ai2_arc
|
2502 |
+
config: ARC-Challenge
|
2503 |
+
split: test
|
2504 |
+
args:
|
2505 |
+
num_few_shot: 25
|
2506 |
+
metrics:
|
2507 |
+
- type: acc_norm
|
2508 |
+
value: 58.11
|
2509 |
+
name: normalized accuracy
|
2510 |
+
source:
|
2511 |
+
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=GritLM/GritLM-7B
|
2512 |
+
name: Open LLM Leaderboard
|
2513 |
+
- task:
|
2514 |
+
type: text-generation
|
2515 |
+
name: Text Generation
|
2516 |
+
dataset:
|
2517 |
+
name: HellaSwag (10-Shot)
|
2518 |
+
type: hellaswag
|
2519 |
+
split: validation
|
2520 |
+
args:
|
2521 |
+
num_few_shot: 10
|
2522 |
+
metrics:
|
2523 |
+
- type: acc_norm
|
2524 |
+
value: 80.97
|
2525 |
+
name: normalized accuracy
|
2526 |
+
source:
|
2527 |
+
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=GritLM/GritLM-7B
|
2528 |
+
name: Open LLM Leaderboard
|
2529 |
+
- task:
|
2530 |
+
type: text-generation
|
2531 |
+
name: Text Generation
|
2532 |
+
dataset:
|
2533 |
+
name: MMLU (5-Shot)
|
2534 |
+
type: cais/mmlu
|
2535 |
+
config: all
|
2536 |
+
split: test
|
2537 |
+
args:
|
2538 |
+
num_few_shot: 5
|
2539 |
+
metrics:
|
2540 |
+
- type: acc
|
2541 |
+
value: 60.29
|
2542 |
+
name: accuracy
|
2543 |
+
source:
|
2544 |
+
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=GritLM/GritLM-7B
|
2545 |
+
name: Open LLM Leaderboard
|
2546 |
+
- task:
|
2547 |
+
type: text-generation
|
2548 |
+
name: Text Generation
|
2549 |
+
dataset:
|
2550 |
+
name: TruthfulQA (0-shot)
|
2551 |
+
type: truthful_qa
|
2552 |
+
config: multiple_choice
|
2553 |
+
split: validation
|
2554 |
+
args:
|
2555 |
+
num_few_shot: 0
|
2556 |
+
metrics:
|
2557 |
+
- type: mc2
|
2558 |
+
value: 45.86
|
2559 |
+
source:
|
2560 |
+
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=GritLM/GritLM-7B
|
2561 |
+
name: Open LLM Leaderboard
|
2562 |
+
- task:
|
2563 |
+
type: text-generation
|
2564 |
+
name: Text Generation
|
2565 |
+
dataset:
|
2566 |
+
name: Winogrande (5-shot)
|
2567 |
+
type: winogrande
|
2568 |
+
config: winogrande_xl
|
2569 |
+
split: validation
|
2570 |
+
args:
|
2571 |
+
num_few_shot: 5
|
2572 |
+
metrics:
|
2573 |
+
- type: acc
|
2574 |
+
value: 78.22
|
2575 |
+
name: accuracy
|
2576 |
+
source:
|
2577 |
+
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=GritLM/GritLM-7B
|
2578 |
+
name: Open LLM Leaderboard
|
2579 |
+
- task:
|
2580 |
+
type: text-generation
|
2581 |
+
name: Text Generation
|
2582 |
+
dataset:
|
2583 |
+
name: GSM8k (5-shot)
|
2584 |
+
type: gsm8k
|
2585 |
+
config: main
|
2586 |
+
split: test
|
2587 |
+
args:
|
2588 |
+
num_few_shot: 5
|
2589 |
+
metrics:
|
2590 |
+
- type: acc
|
2591 |
+
value: 45.03
|
2592 |
+
name: accuracy
|
2593 |
+
source:
|
2594 |
+
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=GritLM/GritLM-7B
|
2595 |
+
name: Open LLM Leaderboard
|
2596 |
---
|
2597 |
|
2598 |
# Model Summary
|
|
|
2624 |
archivePrefix={arXiv},
|
2625 |
primaryClass={cs.CL}
|
2626 |
}
|
2627 |
+
```
|
2628 |
+
# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
|
2629 |
+
Detailed results can be found in the [open-llm-leaderboard/details_GritLM__GritLM-7B dataset](https://huggingface.co/datasets/open-llm-leaderboard/details_GritLM__GritLM-7B).
|
2630 |
+
|
2631 |
+
| Metric |Value|
|
2632 |
+
|---------------------------------|----:|
|
2633 |
+
|Avg. |61.41|
|
2634 |
+
|AI2 Reasoning Challenge (25-Shot)|58.11|
|
2635 |
+
|HellaSwag (10-Shot) |80.97|
|
2636 |
+
|MMLU (5-Shot) |60.29|
|
2637 |
+
|TruthfulQA (0-shot) |45.86|
|
2638 |
+
|Winogrande (5-shot) |78.22|
|
2639 |
+
|GSM8k (5-shot) |45.03|
|
2640 |
+
|