leaderboard-pr-bot committed on
Commit
0fc115c
1 Parent(s): 13f00a0

Adding Evaluation Results

Browse files

This is an automated PR created with https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr

The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.

If you encounter any issues, please report them to https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr/discussions

Files changed (1) hide show
  1. README.md +174 -169
README.md CHANGED
@@ -1,19 +1,19 @@
1
  ---
2
- pipeline_tag: text-generation
3
- inference: true
4
  license: apache-2.0
5
- datasets:
6
- - GritLM/tulu2
7
  tags:
8
  - mteb
 
 
 
 
9
  model-index:
10
  - name: GritLM-7B
11
  results:
12
  - task:
13
  type: Classification
14
  dataset:
15
- type: mteb/amazon_counterfactual
16
  name: MTEB AmazonCounterfactualClassification (en)
 
17
  config: en
18
  split: test
19
  revision: e8379541af4e31359cca9fbcf4b00f2671dba205
@@ -27,8 +27,8 @@ model-index:
27
  - task:
28
  type: Classification
29
  dataset:
30
- type: mteb/amazon_polarity
31
  name: MTEB AmazonPolarityClassification
 
32
  config: default
33
  split: test
34
  revision: e2d317d38cd51312af73b3d32a06d1a08b442046
@@ -42,8 +42,8 @@ model-index:
42
  - task:
43
  type: Classification
44
  dataset:
45
- type: mteb/amazon_reviews_multi
46
  name: MTEB AmazonReviewsClassification (en)
 
47
  config: en
48
  split: test
49
  revision: 1399c76144fd37290681b995c656ef9b2e06e26d
@@ -55,8 +55,8 @@ model-index:
55
  - task:
56
  type: Retrieval
57
  dataset:
58
- type: arguana
59
  name: MTEB ArguAna
 
60
  config: default
61
  split: test
62
  revision: None
@@ -124,8 +124,8 @@ model-index:
124
  - task:
125
  type: Clustering
126
  dataset:
127
- type: mteb/arxiv-clustering-p2p
128
  name: MTEB ArxivClusteringP2P
 
129
  config: default
130
  split: test
131
  revision: a122ad7f3f0291bf49cc6f4d32aa80929df69d5d
@@ -135,8 +135,8 @@ model-index:
135
  - task:
136
  type: Clustering
137
  dataset:
138
- type: mteb/arxiv-clustering-s2s
139
  name: MTEB ArxivClusteringS2S
 
140
  config: default
141
  split: test
142
  revision: f910caf1a6075f7329cdf8c1a6135696f37dbd53
@@ -146,8 +146,8 @@ model-index:
146
  - task:
147
  type: Reranking
148
  dataset:
149
- type: mteb/askubuntudupquestions-reranking
150
  name: MTEB AskUbuntuDupQuestions
 
151
  config: default
152
  split: test
153
  revision: 2000358ca161889fa9c082cb41daa8dcfb161a54
@@ -159,8 +159,8 @@ model-index:
159
  - task:
160
  type: STS
161
  dataset:
162
- type: mteb/biosses-sts
163
  name: MTEB BIOSSES
 
164
  config: default
165
  split: test
166
  revision: d3fb88f8f02e40887cd149695127462bbcf29b4a
@@ -180,8 +180,8 @@ model-index:
180
  - task:
181
  type: Classification
182
  dataset:
183
- type: mteb/banking77
184
  name: MTEB Banking77Classification
 
185
  config: default
186
  split: test
187
  revision: 0fd18e25b25c072e09e0d92ab615fda904d66300
@@ -193,8 +193,8 @@ model-index:
193
  - task:
194
  type: Clustering
195
  dataset:
196
- type: mteb/biorxiv-clustering-p2p
197
  name: MTEB BiorxivClusteringP2P
 
198
  config: default
199
  split: test
200
  revision: 65b79d1d13f80053f67aca9498d9402c2d9f1f40
@@ -204,8 +204,8 @@ model-index:
204
  - task:
205
  type: Clustering
206
  dataset:
207
- type: mteb/biorxiv-clustering-s2s
208
  name: MTEB BiorxivClusteringS2S
 
209
  config: default
210
  split: test
211
  revision: 258694dd0231531bc1fd9de6ceb52a0853c6d908
@@ -215,8 +215,8 @@ model-index:
215
  - task:
216
  type: Retrieval
217
  dataset:
218
- type: BeIR/cqadupstack
219
  name: MTEB CQADupstackAndroidRetrieval
 
220
  config: default
221
  split: test
222
  revision: None
@@ -281,15 +281,6 @@ model-index:
281
  value: 56.58
282
  - type: recall_at_5
283
  value: 63.125
284
- - task:
285
- type: Retrieval
286
- dataset:
287
- type: BeIR/cqadupstack
288
- name: MTEB CQADupstackEnglishRetrieval
289
- config: default
290
- split: test
291
- revision: None
292
- metrics:
293
  - type: map_at_1
294
  value: 38.025999999999996
295
  - type: map_at_10
@@ -350,15 +341,6 @@ model-index:
350
  value: 54.493
351
  - type: recall_at_5
352
  value: 59.64699999999999
353
- - task:
354
- type: Retrieval
355
- dataset:
356
- type: BeIR/cqadupstack
357
- name: MTEB CQADupstackGamingRetrieval
358
- config: default
359
- split: test
360
- revision: None
361
- metrics:
362
  - type: map_at_1
363
  value: 47.905
364
  - type: map_at_10
@@ -419,15 +401,6 @@ model-index:
419
  value: 67.05600000000001
420
  - type: recall_at_5
421
  value: 74.261
422
- - task:
423
- type: Retrieval
424
- dataset:
425
- type: BeIR/cqadupstack
426
- name: MTEB CQADupstackGisRetrieval
427
- config: default
428
- split: test
429
- revision: None
430
- metrics:
431
  - type: map_at_1
432
  value: 30.745
433
  - type: map_at_10
@@ -488,15 +461,6 @@ model-index:
488
  value: 45.378
489
  - type: recall_at_5
490
  value: 53.580000000000005
491
- - task:
492
- type: Retrieval
493
- dataset:
494
- type: BeIR/cqadupstack
495
- name: MTEB CQADupstackMathematicaRetrieval
496
- config: default
497
- split: test
498
- revision: None
499
- metrics:
500
  - type: map_at_1
501
  value: 19.637999999999998
502
  - type: map_at_10
@@ -557,15 +521,6 @@ model-index:
557
  value: 36.384
558
  - type: recall_at_5
559
  value: 43.964
560
- - task:
561
- type: Retrieval
562
- dataset:
563
- type: BeIR/cqadupstack
564
- name: MTEB CQADupstackPhysicsRetrieval
565
- config: default
566
- split: test
567
- revision: None
568
- metrics:
569
  - type: map_at_1
570
  value: 34.884
571
  - type: map_at_10
@@ -626,15 +581,6 @@ model-index:
626
  value: 52.428
627
  - type: recall_at_5
628
  value: 60.662000000000006
629
- - task:
630
- type: Retrieval
631
- dataset:
632
- type: BeIR/cqadupstack
633
- name: MTEB CQADupstackProgrammersRetrieval
634
- config: default
635
- split: test
636
- revision: None
637
- metrics:
638
  - type: map_at_1
639
  value: 31.588
640
  - type: map_at_10
@@ -695,15 +641,6 @@ model-index:
695
  value: 47.128
696
  - type: recall_at_5
697
  value: 54.954
698
- - task:
699
- type: Retrieval
700
- dataset:
701
- type: BeIR/cqadupstack
702
- name: MTEB CQADupstackRetrieval
703
- config: default
704
- split: test
705
- revision: None
706
- metrics:
707
  - type: map_at_1
708
  value: 31.956083333333336
709
  - type: map_at_10
@@ -764,15 +701,6 @@ model-index:
764
  value: 47.52016666666666
765
  - type: recall_at_5
766
  value: 54.36066666666666
767
- - task:
768
- type: Retrieval
769
- dataset:
770
- type: BeIR/cqadupstack
771
- name: MTEB CQADupstackStatsRetrieval
772
- config: default
773
- split: test
774
- revision: None
775
- metrics:
776
  - type: map_at_1
777
  value: 28.912
778
  - type: map_at_10
@@ -833,15 +761,6 @@ model-index:
833
  value: 42.569
834
  - type: recall_at_5
835
  value: 48.719
836
- - task:
837
- type: Retrieval
838
- dataset:
839
- type: BeIR/cqadupstack
840
- name: MTEB CQADupstackTexRetrieval
841
- config: default
842
- split: test
843
- revision: None
844
- metrics:
845
  - type: map_at_1
846
  value: 22.181
847
  - type: map_at_10
@@ -902,15 +821,6 @@ model-index:
902
  value: 35.003
903
  - type: recall_at_5
904
  value: 40.876000000000005
905
- - task:
906
- type: Retrieval
907
- dataset:
908
- type: BeIR/cqadupstack
909
- name: MTEB CQADupstackUnixRetrieval
910
- config: default
911
- split: test
912
- revision: None
913
- metrics:
914
  - type: map_at_1
915
  value: 33.934999999999995
916
  - type: map_at_10
@@ -971,15 +881,6 @@ model-index:
971
  value: 47.439
972
  - type: recall_at_5
973
  value: 54.567
974
- - task:
975
- type: Retrieval
976
- dataset:
977
- type: BeIR/cqadupstack
978
- name: MTEB CQADupstackWebmastersRetrieval
979
- config: default
980
- split: test
981
- revision: None
982
- metrics:
983
  - type: map_at_1
984
  value: 32.058
985
  - type: map_at_10
@@ -1040,15 +941,6 @@ model-index:
1040
  value: 47.509
1041
  - type: recall_at_5
1042
  value: 52.455
1043
- - task:
1044
- type: Retrieval
1045
- dataset:
1046
- type: BeIR/cqadupstack
1047
- name: MTEB CQADupstackWordpressRetrieval
1048
- config: default
1049
- split: test
1050
- revision: None
1051
- metrics:
1052
  - type: map_at_1
1053
  value: 26.029000000000003
1054
  - type: map_at_10
@@ -1112,8 +1004,8 @@ model-index:
1112
  - task:
1113
  type: Retrieval
1114
  dataset:
1115
- type: climate-fever
1116
  name: MTEB ClimateFEVER
 
1117
  config: default
1118
  split: test
1119
  revision: None
@@ -1181,8 +1073,8 @@ model-index:
1181
  - task:
1182
  type: Retrieval
1183
  dataset:
1184
- type: dbpedia-entity
1185
  name: MTEB DBPedia
 
1186
  config: default
1187
  split: test
1188
  revision: None
@@ -1250,8 +1142,8 @@ model-index:
1250
  - task:
1251
  type: Classification
1252
  dataset:
1253
- type: mteb/emotion
1254
  name: MTEB EmotionClassification
 
1255
  config: default
1256
  split: test
1257
  revision: 4f58c6b202a23cf9a4da393831edf4f9183cad37
@@ -1263,8 +1155,8 @@ model-index:
1263
  - task:
1264
  type: Retrieval
1265
  dataset:
1266
- type: fever
1267
  name: MTEB FEVER
 
1268
  config: default
1269
  split: test
1270
  revision: None
@@ -1332,8 +1224,8 @@ model-index:
1332
  - task:
1333
  type: Retrieval
1334
  dataset:
1335
- type: fiqa
1336
  name: MTEB FiQA2018
 
1337
  config: default
1338
  split: test
1339
  revision: None
@@ -1401,8 +1293,8 @@ model-index:
1401
  - task:
1402
  type: Retrieval
1403
  dataset:
1404
- type: hotpotqa
1405
  name: MTEB HotpotQA
 
1406
  config: default
1407
  split: test
1408
  revision: None
@@ -1470,8 +1362,8 @@ model-index:
1470
  - task:
1471
  type: Classification
1472
  dataset:
1473
- type: mteb/imdb
1474
  name: MTEB ImdbClassification
 
1475
  config: default
1476
  split: test
1477
  revision: 3d86128a09e091d6018b6d26cad27f2739fc2db7
@@ -1485,8 +1377,8 @@ model-index:
1485
  - task:
1486
  type: Retrieval
1487
  dataset:
1488
- type: msmarco
1489
  name: MTEB MSMARCO
 
1490
  config: default
1491
  split: dev
1492
  revision: None
@@ -1554,8 +1446,8 @@ model-index:
1554
  - task:
1555
  type: Classification
1556
  dataset:
1557
- type: mteb/mtop_domain
1558
  name: MTEB MTOPDomainClassification (en)
 
1559
  config: en
1560
  split: test
1561
  revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf
@@ -1567,8 +1459,8 @@ model-index:
1567
  - task:
1568
  type: Classification
1569
  dataset:
1570
- type: mteb/mtop_intent
1571
  name: MTEB MTOPIntentClassification (en)
 
1572
  config: en
1573
  split: test
1574
  revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba
@@ -1580,8 +1472,8 @@ model-index:
1580
  - task:
1581
  type: Classification
1582
  dataset:
1583
- type: mteb/amazon_massive_intent
1584
  name: MTEB MassiveIntentClassification (en)
 
1585
  config: en
1586
  split: test
1587
  revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
@@ -1593,8 +1485,8 @@ model-index:
1593
  - task:
1594
  type: Classification
1595
  dataset:
1596
- type: mteb/amazon_massive_scenario
1597
  name: MTEB MassiveScenarioClassification (en)
 
1598
  config: en
1599
  split: test
1600
  revision: 7d571f92784cd94a019292a1f45445077d0ef634
@@ -1606,8 +1498,8 @@ model-index:
1606
  - task:
1607
  type: Clustering
1608
  dataset:
1609
- type: mteb/medrxiv-clustering-p2p
1610
  name: MTEB MedrxivClusteringP2P
 
1611
  config: default
1612
  split: test
1613
  revision: e7a26af6f3ae46b30dde8737f02c07b1505bcc73
@@ -1617,8 +1509,8 @@ model-index:
1617
  - task:
1618
  type: Clustering
1619
  dataset:
1620
- type: mteb/medrxiv-clustering-s2s
1621
  name: MTEB MedrxivClusteringS2S
 
1622
  config: default
1623
  split: test
1624
  revision: 35191c8c0dca72d8ff3efcd72aa802307d469663
@@ -1628,8 +1520,8 @@ model-index:
1628
  - task:
1629
  type: Reranking
1630
  dataset:
1631
- type: mteb/mind_small
1632
  name: MTEB MindSmallReranking
 
1633
  config: default
1634
  split: test
1635
  revision: 3bdac13927fdc888b903db93b2ffdbd90b295a69
@@ -1641,8 +1533,8 @@ model-index:
1641
  - task:
1642
  type: Retrieval
1643
  dataset:
1644
- type: nfcorpus
1645
  name: MTEB NFCorpus
 
1646
  config: default
1647
  split: test
1648
  revision: None
@@ -1710,8 +1602,8 @@ model-index:
1710
  - task:
1711
  type: Retrieval
1712
  dataset:
1713
- type: nq
1714
  name: MTEB NQ
 
1715
  config: default
1716
  split: test
1717
  revision: None
@@ -1779,8 +1671,8 @@ model-index:
1779
  - task:
1780
  type: Retrieval
1781
  dataset:
1782
- type: quora
1783
  name: MTEB QuoraRetrieval
 
1784
  config: default
1785
  split: test
1786
  revision: None
@@ -1848,8 +1740,8 @@ model-index:
1848
  - task:
1849
  type: Clustering
1850
  dataset:
1851
- type: mteb/reddit-clustering
1852
  name: MTEB RedditClustering
 
1853
  config: default
1854
  split: test
1855
  revision: 24640382cdbf8abc73003fb0fa6d111a705499eb
@@ -1859,8 +1751,8 @@ model-index:
1859
  - task:
1860
  type: Clustering
1861
  dataset:
1862
- type: mteb/reddit-clustering-p2p
1863
  name: MTEB RedditClusteringP2P
 
1864
  config: default
1865
  split: test
1866
  revision: 282350215ef01743dc01b456c7f5241fa8937f16
@@ -1870,8 +1762,8 @@ model-index:
1870
  - task:
1871
  type: Retrieval
1872
  dataset:
1873
- type: scidocs
1874
  name: MTEB SCIDOCS
 
1875
  config: default
1876
  split: test
1877
  revision: None
@@ -1939,8 +1831,8 @@ model-index:
1939
  - task:
1940
  type: STS
1941
  dataset:
1942
- type: mteb/sickr-sts
1943
  name: MTEB SICK-R
 
1944
  config: default
1945
  split: test
1946
  revision: a6ea5a8cab320b040a23452cc28066d9beae2cee
@@ -1960,8 +1852,8 @@ model-index:
1960
  - task:
1961
  type: STS
1962
  dataset:
1963
- type: mteb/sts12-sts
1964
  name: MTEB STS12
 
1965
  config: default
1966
  split: test
1967
  revision: a0d554a64d88156834ff5ae9920b964011b16384
@@ -1981,8 +1873,8 @@ model-index:
1981
  - task:
1982
  type: STS
1983
  dataset:
1984
- type: mteb/sts13-sts
1985
  name: MTEB STS13
 
1986
  config: default
1987
  split: test
1988
  revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca
@@ -2002,8 +1894,8 @@ model-index:
2002
  - task:
2003
  type: STS
2004
  dataset:
2005
- type: mteb/sts14-sts
2006
  name: MTEB STS14
 
2007
  config: default
2008
  split: test
2009
  revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375
@@ -2023,8 +1915,8 @@ model-index:
2023
  - task:
2024
  type: STS
2025
  dataset:
2026
- type: mteb/sts15-sts
2027
  name: MTEB STS15
 
2028
  config: default
2029
  split: test
2030
  revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3
@@ -2044,8 +1936,8 @@ model-index:
2044
  - task:
2045
  type: STS
2046
  dataset:
2047
- type: mteb/sts16-sts
2048
  name: MTEB STS16
 
2049
  config: default
2050
  split: test
2051
  revision: 4d8694f8f0e0100860b497b999b3dbed754a0513
@@ -2065,8 +1957,8 @@ model-index:
2065
  - task:
2066
  type: STS
2067
  dataset:
2068
- type: mteb/sts17-crosslingual-sts
2069
  name: MTEB STS17 (en-en)
 
2070
  config: en-en
2071
  split: test
2072
  revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d
@@ -2086,8 +1978,8 @@ model-index:
2086
  - task:
2087
  type: STS
2088
  dataset:
2089
- type: mteb/sts22-crosslingual-sts
2090
  name: MTEB STS22 (en)
 
2091
  config: en
2092
  split: test
2093
  revision: eea2b4fe26a775864c896887d910b76a8098ad3f
@@ -2107,8 +1999,8 @@ model-index:
2107
  - task:
2108
  type: STS
2109
  dataset:
2110
- type: mteb/stsbenchmark-sts
2111
  name: MTEB STSBenchmark
 
2112
  config: default
2113
  split: test
2114
  revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831
@@ -2128,8 +2020,8 @@ model-index:
2128
  - task:
2129
  type: Reranking
2130
  dataset:
2131
- type: mteb/scidocs-reranking
2132
  name: MTEB SciDocsRR
 
2133
  config: default
2134
  split: test
2135
  revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab
@@ -2141,8 +2033,8 @@ model-index:
2141
  - task:
2142
  type: Retrieval
2143
  dataset:
2144
- type: scifact
2145
  name: MTEB SciFact
 
2146
  config: default
2147
  split: test
2148
  revision: None
@@ -2210,8 +2102,8 @@ model-index:
2210
  - task:
2211
  type: PairClassification
2212
  dataset:
2213
- type: mteb/sprintduplicatequestions-pairclassification
2214
  name: MTEB SprintDuplicateQuestions
 
2215
  config: default
2216
  split: test
2217
  revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46
@@ -2265,8 +2157,8 @@ model-index:
2265
  - task:
2266
  type: Clustering
2267
  dataset:
2268
- type: mteb/stackexchange-clustering
2269
  name: MTEB StackExchangeClustering
 
2270
  config: default
2271
  split: test
2272
  revision: 6cbc1f7b2bc0622f2e39d2c77fa502909748c259
@@ -2276,8 +2168,8 @@ model-index:
2276
  - task:
2277
  type: Clustering
2278
  dataset:
2279
- type: mteb/stackexchange-clustering-p2p
2280
  name: MTEB StackExchangeClusteringP2P
 
2281
  config: default
2282
  split: test
2283
  revision: 815ca46b2622cec33ccafc3735d572c266efdb44
@@ -2287,8 +2179,8 @@ model-index:
2287
  - task:
2288
  type: Reranking
2289
  dataset:
2290
- type: mteb/stackoverflowdupquestions-reranking
2291
  name: MTEB StackOverflowDupQuestions
 
2292
  config: default
2293
  split: test
2294
  revision: e185fbe320c72810689fc5848eb6114e1ef5ec69
@@ -2300,8 +2192,8 @@ model-index:
2300
  - task:
2301
  type: Summarization
2302
  dataset:
2303
- type: mteb/summeval
2304
  name: MTEB SummEval
 
2305
  config: default
2306
  split: test
2307
  revision: cda12ad7615edc362dbf25a00fdd61d3b1eaf93c
@@ -2317,8 +2209,8 @@ model-index:
2317
  - task:
2318
  type: Retrieval
2319
  dataset:
2320
- type: trec-covid
2321
  name: MTEB TRECCOVID
 
2322
  config: default
2323
  split: test
2324
  revision: None
@@ -2386,8 +2278,8 @@ model-index:
2386
  - task:
2387
  type: Retrieval
2388
  dataset:
2389
- type: webis-touche2020
2390
  name: MTEB Touche2020
 
2391
  config: default
2392
  split: test
2393
  revision: None
@@ -2455,8 +2347,8 @@ model-index:
2455
  - task:
2456
  type: Classification
2457
  dataset:
2458
- type: mteb/toxic_conversations_50k
2459
  name: MTEB ToxicConversationsClassification
 
2460
  config: default
2461
  split: test
2462
  revision: d7c0de2777da35d6aae2200a62c6e0e5af397c4c
@@ -2470,8 +2362,8 @@ model-index:
2470
  - task:
2471
  type: Classification
2472
  dataset:
2473
- type: mteb/tweet_sentiment_extraction
2474
  name: MTEB TweetSentimentExtractionClassification
 
2475
  config: default
2476
  split: test
2477
  revision: d604517c81ca91fe16a244d1248fc021f9ecee7a
@@ -2483,8 +2375,8 @@ model-index:
2483
  - task:
2484
  type: Clustering
2485
  dataset:
2486
- type: mteb/twentynewsgroups-clustering
2487
  name: MTEB TwentyNewsgroupsClustering
 
2488
  config: default
2489
  split: test
2490
  revision: 6125ec4e24fa026cec8a478383ee943acfbd5449
@@ -2494,8 +2386,8 @@ model-index:
2494
  - task:
2495
  type: PairClassification
2496
  dataset:
2497
- type: mteb/twittersemeval2015-pairclassification
2498
  name: MTEB TwitterSemEval2015
 
2499
  config: default
2500
  split: test
2501
  revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1
@@ -2549,8 +2441,8 @@ model-index:
2549
  - task:
2550
  type: PairClassification
2551
  dataset:
2552
- type: mteb/twitterurlcorpus-pairclassification
2553
  name: MTEB TwitterURLCorpus
 
2554
  config: default
2555
  split: test
2556
  revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf
@@ -2601,6 +2493,106 @@ model-index:
2601
  value: 87.47814292587448
2602
  - type: max_f1
2603
  value: 80.15461150280949
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2604
  ---
2605
 
2606
  # Model Summary
@@ -2632,4 +2624,17 @@ The model usage is documented [here](https://github.com/ContextualAI/gritlm?tab=
2632
  archivePrefix={arXiv},
2633
  primaryClass={cs.CL}
2634
  }
2635
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
 
 
2
  license: apache-2.0
 
 
3
  tags:
4
  - mteb
5
+ datasets:
6
+ - GritLM/tulu2
7
+ pipeline_tag: text-generation
8
+ inference: true
9
  model-index:
10
  - name: GritLM-7B
11
  results:
12
  - task:
13
  type: Classification
14
  dataset:
 
15
  name: MTEB AmazonCounterfactualClassification (en)
16
+ type: mteb/amazon_counterfactual
17
  config: en
18
  split: test
19
  revision: e8379541af4e31359cca9fbcf4b00f2671dba205
 
27
  - task:
28
  type: Classification
29
  dataset:
 
30
  name: MTEB AmazonPolarityClassification
31
+ type: mteb/amazon_polarity
32
  config: default
33
  split: test
34
  revision: e2d317d38cd51312af73b3d32a06d1a08b442046
 
42
  - task:
43
  type: Classification
44
  dataset:
 
45
  name: MTEB AmazonReviewsClassification (en)
46
+ type: mteb/amazon_reviews_multi
47
  config: en
48
  split: test
49
  revision: 1399c76144fd37290681b995c656ef9b2e06e26d
 
55
  - task:
56
  type: Retrieval
57
  dataset:
 
58
  name: MTEB ArguAna
59
+ type: arguana
60
  config: default
61
  split: test
62
  revision: None
 
124
  - task:
125
  type: Clustering
126
  dataset:
 
127
  name: MTEB ArxivClusteringP2P
128
+ type: mteb/arxiv-clustering-p2p
129
  config: default
130
  split: test
131
  revision: a122ad7f3f0291bf49cc6f4d32aa80929df69d5d
 
135
  - task:
136
  type: Clustering
137
  dataset:
 
138
  name: MTEB ArxivClusteringS2S
139
+ type: mteb/arxiv-clustering-s2s
140
  config: default
141
  split: test
142
  revision: f910caf1a6075f7329cdf8c1a6135696f37dbd53
 
146
  - task:
147
  type: Reranking
148
  dataset:
 
149
  name: MTEB AskUbuntuDupQuestions
150
+ type: mteb/askubuntudupquestions-reranking
151
  config: default
152
  split: test
153
  revision: 2000358ca161889fa9c082cb41daa8dcfb161a54
 
159
  - task:
160
  type: STS
161
  dataset:
 
162
  name: MTEB BIOSSES
163
+ type: mteb/biosses-sts
164
  config: default
165
  split: test
166
  revision: d3fb88f8f02e40887cd149695127462bbcf29b4a
 
180
  - task:
181
  type: Classification
182
  dataset:
 
183
  name: MTEB Banking77Classification
184
+ type: mteb/banking77
185
  config: default
186
  split: test
187
  revision: 0fd18e25b25c072e09e0d92ab615fda904d66300
 
193
  - task:
194
  type: Clustering
195
  dataset:
 
196
  name: MTEB BiorxivClusteringP2P
197
+ type: mteb/biorxiv-clustering-p2p
198
  config: default
199
  split: test
200
  revision: 65b79d1d13f80053f67aca9498d9402c2d9f1f40
 
204
  - task:
205
  type: Clustering
206
  dataset:
 
207
  name: MTEB BiorxivClusteringS2S
208
+ type: mteb/biorxiv-clustering-s2s
209
  config: default
210
  split: test
211
  revision: 258694dd0231531bc1fd9de6ceb52a0853c6d908
 
215
  - task:
216
  type: Retrieval
217
  dataset:
 
218
  name: MTEB CQADupstackAndroidRetrieval
219
+ type: BeIR/cqadupstack
220
  config: default
221
  split: test
222
  revision: None
 
281
  value: 56.58
282
  - type: recall_at_5
283
  value: 63.125
 
 
 
 
 
 
 
 
 
284
  - type: map_at_1
285
  value: 38.025999999999996
286
  - type: map_at_10
 
341
  value: 54.493
342
  - type: recall_at_5
343
  value: 59.64699999999999
 
 
 
 
 
 
 
 
 
344
  - type: map_at_1
345
  value: 47.905
346
  - type: map_at_10
 
401
  value: 67.05600000000001
402
  - type: recall_at_5
403
  value: 74.261
 
 
 
 
 
 
 
 
 
404
  - type: map_at_1
405
  value: 30.745
406
  - type: map_at_10
 
461
  value: 45.378
462
  - type: recall_at_5
463
  value: 53.580000000000005
 
 
 
 
 
 
 
 
 
464
  - type: map_at_1
465
  value: 19.637999999999998
466
  - type: map_at_10
 
521
  value: 36.384
522
  - type: recall_at_5
523
  value: 43.964
 
 
 
 
 
 
 
 
 
524
  - type: map_at_1
525
  value: 34.884
526
  - type: map_at_10
 
581
  value: 52.428
582
  - type: recall_at_5
583
  value: 60.662000000000006
 
 
 
 
 
 
 
 
 
584
  - type: map_at_1
585
  value: 31.588
586
  - type: map_at_10
 
641
  value: 47.128
642
  - type: recall_at_5
643
  value: 54.954
 
 
 
 
 
 
 
 
 
644
  - type: map_at_1
645
  value: 31.956083333333336
646
  - type: map_at_10
 
701
  value: 47.52016666666666
702
  - type: recall_at_5
703
  value: 54.36066666666666
 
 
 
 
 
 
 
 
 
704
  - type: map_at_1
705
  value: 28.912
706
  - type: map_at_10
 
761
  value: 42.569
762
  - type: recall_at_5
763
  value: 48.719
 
 
 
 
 
 
 
 
 
764
  - type: map_at_1
765
  value: 22.181
766
  - type: map_at_10
 
821
  value: 35.003
822
  - type: recall_at_5
823
  value: 40.876000000000005
 
 
 
 
 
 
 
 
 
824
  - type: map_at_1
825
  value: 33.934999999999995
826
  - type: map_at_10
 
881
  value: 47.439
882
  - type: recall_at_5
883
  value: 54.567
 
 
 
 
 
 
 
 
 
884
  - type: map_at_1
885
  value: 32.058
886
  - type: map_at_10
 
941
  value: 47.509
942
  - type: recall_at_5
943
  value: 52.455
 
 
 
 
 
 
 
 
 
944
  - type: map_at_1
945
  value: 26.029000000000003
946
  - type: map_at_10
 
1004
  - task:
1005
  type: Retrieval
1006
  dataset:
 
1007
  name: MTEB ClimateFEVER
1008
+ type: climate-fever
1009
  config: default
1010
  split: test
1011
  revision: None
 
1073
  - task:
1074
  type: Retrieval
1075
  dataset:
 
1076
  name: MTEB DBPedia
1077
+ type: dbpedia-entity
1078
  config: default
1079
  split: test
1080
  revision: None
 
1142
  - task:
1143
  type: Classification
1144
  dataset:
 
1145
  name: MTEB EmotionClassification
1146
+ type: mteb/emotion
1147
  config: default
1148
  split: test
1149
  revision: 4f58c6b202a23cf9a4da393831edf4f9183cad37
 
1155
  - task:
1156
  type: Retrieval
1157
  dataset:
 
1158
  name: MTEB FEVER
1159
+ type: fever
1160
  config: default
1161
  split: test
1162
  revision: None
 
1224
  - task:
1225
  type: Retrieval
1226
  dataset:
 
1227
  name: MTEB FiQA2018
1228
+ type: fiqa
1229
  config: default
1230
  split: test
1231
  revision: None
 
1293
  - task:
1294
  type: Retrieval
1295
  dataset:
 
1296
  name: MTEB HotpotQA
1297
+ type: hotpotqa
1298
  config: default
1299
  split: test
1300
  revision: None
 
1362
  - task:
1363
  type: Classification
1364
  dataset:
 
1365
  name: MTEB ImdbClassification
1366
+ type: mteb/imdb
1367
  config: default
1368
  split: test
1369
  revision: 3d86128a09e091d6018b6d26cad27f2739fc2db7
 
1377
  - task:
1378
  type: Retrieval
1379
  dataset:
 
1380
  name: MTEB MSMARCO
1381
+ type: msmarco
1382
  config: default
1383
  split: dev
1384
  revision: None
 
1446
  - task:
1447
  type: Classification
1448
  dataset:
 
1449
  name: MTEB MTOPDomainClassification (en)
1450
+ type: mteb/mtop_domain
1451
  config: en
1452
  split: test
1453
  revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf
 
1459
  - task:
1460
  type: Classification
1461
  dataset:
 
1462
  name: MTEB MTOPIntentClassification (en)
1463
+ type: mteb/mtop_intent
1464
  config: en
1465
  split: test
1466
  revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba
 
1472
  - task:
1473
  type: Classification
1474
  dataset:
 
1475
  name: MTEB MassiveIntentClassification (en)
1476
+ type: mteb/amazon_massive_intent
1477
  config: en
1478
  split: test
1479
  revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
 
1485
  - task:
1486
  type: Classification
1487
  dataset:
 
1488
  name: MTEB MassiveScenarioClassification (en)
1489
+ type: mteb/amazon_massive_scenario
1490
  config: en
1491
  split: test
1492
  revision: 7d571f92784cd94a019292a1f45445077d0ef634
 
1498
  - task:
1499
  type: Clustering
1500
  dataset:
 
1501
  name: MTEB MedrxivClusteringP2P
1502
+ type: mteb/medrxiv-clustering-p2p
1503
  config: default
1504
  split: test
1505
  revision: e7a26af6f3ae46b30dde8737f02c07b1505bcc73
 
1509
  - task:
1510
  type: Clustering
1511
  dataset:
 
1512
  name: MTEB MedrxivClusteringS2S
1513
+ type: mteb/medrxiv-clustering-s2s
1514
  config: default
1515
  split: test
1516
  revision: 35191c8c0dca72d8ff3efcd72aa802307d469663
 
1520
  - task:
1521
  type: Reranking
1522
  dataset:
 
1523
  name: MTEB MindSmallReranking
1524
+ type: mteb/mind_small
1525
  config: default
1526
  split: test
1527
  revision: 3bdac13927fdc888b903db93b2ffdbd90b295a69
 
1533
  - task:
1534
  type: Retrieval
1535
  dataset:
 
1536
  name: MTEB NFCorpus
1537
+ type: nfcorpus
1538
  config: default
1539
  split: test
1540
  revision: None
 
1602
  - task:
1603
  type: Retrieval
1604
  dataset:
 
1605
  name: MTEB NQ
1606
+ type: nq
1607
  config: default
1608
  split: test
1609
  revision: None
 
1671
  - task:
1672
  type: Retrieval
1673
  dataset:
 
1674
  name: MTEB QuoraRetrieval
1675
+ type: quora
1676
  config: default
1677
  split: test
1678
  revision: None
 
1740
  - task:
1741
  type: Clustering
1742
  dataset:
 
1743
  name: MTEB RedditClustering
1744
+ type: mteb/reddit-clustering
1745
  config: default
1746
  split: test
1747
  revision: 24640382cdbf8abc73003fb0fa6d111a705499eb
 
1751
  - task:
1752
  type: Clustering
1753
  dataset:
 
1754
  name: MTEB RedditClusteringP2P
1755
+ type: mteb/reddit-clustering-p2p
1756
  config: default
1757
  split: test
1758
  revision: 282350215ef01743dc01b456c7f5241fa8937f16
 
1762
  - task:
1763
  type: Retrieval
1764
  dataset:
 
1765
  name: MTEB SCIDOCS
1766
+ type: scidocs
1767
  config: default
1768
  split: test
1769
  revision: None
 
1831
  - task:
1832
  type: STS
1833
  dataset:
 
1834
  name: MTEB SICK-R
1835
+ type: mteb/sickr-sts
1836
  config: default
1837
  split: test
1838
  revision: a6ea5a8cab320b040a23452cc28066d9beae2cee
 
1852
  - task:
1853
  type: STS
1854
  dataset:
 
1855
  name: MTEB STS12
1856
+ type: mteb/sts12-sts
1857
  config: default
1858
  split: test
1859
  revision: a0d554a64d88156834ff5ae9920b964011b16384
 
1873
  - task:
1874
  type: STS
1875
  dataset:
 
1876
  name: MTEB STS13
1877
+ type: mteb/sts13-sts
1878
  config: default
1879
  split: test
1880
  revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca
 
1894
  - task:
1895
  type: STS
1896
  dataset:
 
1897
  name: MTEB STS14
1898
+ type: mteb/sts14-sts
1899
  config: default
1900
  split: test
1901
  revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375
 
1915
  - task:
1916
  type: STS
1917
  dataset:
 
1918
  name: MTEB STS15
1919
+ type: mteb/sts15-sts
1920
  config: default
1921
  split: test
1922
  revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3
 
1936
  - task:
1937
  type: STS
1938
  dataset:
 
1939
  name: MTEB STS16
1940
+ type: mteb/sts16-sts
1941
  config: default
1942
  split: test
1943
  revision: 4d8694f8f0e0100860b497b999b3dbed754a0513
 
1957
  - task:
1958
  type: STS
1959
  dataset:
 
1960
  name: MTEB STS17 (en-en)
1961
+ type: mteb/sts17-crosslingual-sts
1962
  config: en-en
1963
  split: test
1964
  revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d
 
1978
  - task:
1979
  type: STS
1980
  dataset:
 
1981
  name: MTEB STS22 (en)
1982
+ type: mteb/sts22-crosslingual-sts
1983
  config: en
1984
  split: test
1985
  revision: eea2b4fe26a775864c896887d910b76a8098ad3f
 
1999
  - task:
2000
  type: STS
2001
  dataset:
 
2002
  name: MTEB STSBenchmark
2003
+ type: mteb/stsbenchmark-sts
2004
  config: default
2005
  split: test
2006
  revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831
 
2020
  - task:
2021
  type: Reranking
2022
  dataset:
 
2023
  name: MTEB SciDocsRR
2024
+ type: mteb/scidocs-reranking
2025
  config: default
2026
  split: test
2027
  revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab
 
2033
  - task:
2034
  type: Retrieval
2035
  dataset:
 
2036
  name: MTEB SciFact
2037
+ type: scifact
2038
  config: default
2039
  split: test
2040
  revision: None
 
2102
  - task:
2103
  type: PairClassification
2104
  dataset:
 
2105
  name: MTEB SprintDuplicateQuestions
2106
+ type: mteb/sprintduplicatequestions-pairclassification
2107
  config: default
2108
  split: test
2109
  revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46
 
2157
  - task:
2158
  type: Clustering
2159
  dataset:
 
2160
  name: MTEB StackExchangeClustering
2161
+ type: mteb/stackexchange-clustering
2162
  config: default
2163
  split: test
2164
  revision: 6cbc1f7b2bc0622f2e39d2c77fa502909748c259
 
2168
  - task:
2169
  type: Clustering
2170
  dataset:
 
2171
  name: MTEB StackExchangeClusteringP2P
2172
+ type: mteb/stackexchange-clustering-p2p
2173
  config: default
2174
  split: test
2175
  revision: 815ca46b2622cec33ccafc3735d572c266efdb44
 
2179
  - task:
2180
  type: Reranking
2181
  dataset:
 
2182
  name: MTEB StackOverflowDupQuestions
2183
+ type: mteb/stackoverflowdupquestions-reranking
2184
  config: default
2185
  split: test
2186
  revision: e185fbe320c72810689fc5848eb6114e1ef5ec69
 
2192
  - task:
2193
  type: Summarization
2194
  dataset:
 
2195
  name: MTEB SummEval
2196
+ type: mteb/summeval
2197
  config: default
2198
  split: test
2199
  revision: cda12ad7615edc362dbf25a00fdd61d3b1eaf93c
 
2209
  - task:
2210
  type: Retrieval
2211
  dataset:
 
2212
  name: MTEB TRECCOVID
2213
+ type: trec-covid
2214
  config: default
2215
  split: test
2216
  revision: None
 
2278
  - task:
2279
  type: Retrieval
2280
  dataset:
 
2281
  name: MTEB Touche2020
2282
+ type: webis-touche2020
2283
  config: default
2284
  split: test
2285
  revision: None
 
2347
  - task:
2348
  type: Classification
2349
  dataset:
 
2350
  name: MTEB ToxicConversationsClassification
2351
+ type: mteb/toxic_conversations_50k
2352
  config: default
2353
  split: test
2354
  revision: d7c0de2777da35d6aae2200a62c6e0e5af397c4c
 
2362
  - task:
2363
  type: Classification
2364
  dataset:
 
2365
  name: MTEB TweetSentimentExtractionClassification
2366
+ type: mteb/tweet_sentiment_extraction
2367
  config: default
2368
  split: test
2369
  revision: d604517c81ca91fe16a244d1248fc021f9ecee7a
 
2375
  - task:
2376
  type: Clustering
2377
  dataset:
 
2378
  name: MTEB TwentyNewsgroupsClustering
2379
+ type: mteb/twentynewsgroups-clustering
2380
  config: default
2381
  split: test
2382
  revision: 6125ec4e24fa026cec8a478383ee943acfbd5449
 
2386
  - task:
2387
  type: PairClassification
2388
  dataset:
 
2389
  name: MTEB TwitterSemEval2015
2390
+ type: mteb/twittersemeval2015-pairclassification
2391
  config: default
2392
  split: test
2393
  revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1
 
2441
  - task:
2442
  type: PairClassification
2443
  dataset:
 
2444
  name: MTEB TwitterURLCorpus
2445
+ type: mteb/twitterurlcorpus-pairclassification
2446
  config: default
2447
  split: test
2448
  revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf
 
2493
  value: 87.47814292587448
2494
  - type: max_f1
2495
  value: 80.15461150280949
2496
+ - task:
2497
+ type: text-generation
2498
+ name: Text Generation
2499
+ dataset:
2500
+ name: AI2 Reasoning Challenge (25-Shot)
2501
+ type: ai2_arc
2502
+ config: ARC-Challenge
2503
+ split: test
2504
+ args:
2505
+ num_few_shot: 25
2506
+ metrics:
2507
+ - type: acc_norm
2508
+ value: 58.11
2509
+ name: normalized accuracy
2510
+ source:
2511
+ url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=GritLM/GritLM-7B
2512
+ name: Open LLM Leaderboard
2513
+ - task:
2514
+ type: text-generation
2515
+ name: Text Generation
2516
+ dataset:
2517
+ name: HellaSwag (10-Shot)
2518
+ type: hellaswag
2519
+ split: validation
2520
+ args:
2521
+ num_few_shot: 10
2522
+ metrics:
2523
+ - type: acc_norm
2524
+ value: 80.97
2525
+ name: normalized accuracy
2526
+ source:
2527
+ url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=GritLM/GritLM-7B
2528
+ name: Open LLM Leaderboard
2529
+ - task:
2530
+ type: text-generation
2531
+ name: Text Generation
2532
+ dataset:
2533
+ name: MMLU (5-Shot)
2534
+ type: cais/mmlu
2535
+ config: all
2536
+ split: test
2537
+ args:
2538
+ num_few_shot: 5
2539
+ metrics:
2540
+ - type: acc
2541
+ value: 60.29
2542
+ name: accuracy
2543
+ source:
2544
+ url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=GritLM/GritLM-7B
2545
+ name: Open LLM Leaderboard
2546
+ - task:
2547
+ type: text-generation
2548
+ name: Text Generation
2549
+ dataset:
2550
+ name: TruthfulQA (0-shot)
2551
+ type: truthful_qa
2552
+ config: multiple_choice
2553
+ split: validation
2554
+ args:
2555
+ num_few_shot: 0
2556
+ metrics:
2557
+ - type: mc2
2558
+ value: 45.86
2559
+ source:
2560
+ url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=GritLM/GritLM-7B
2561
+ name: Open LLM Leaderboard
2562
+ - task:
2563
+ type: text-generation
2564
+ name: Text Generation
2565
+ dataset:
2566
+ name: Winogrande (5-shot)
2567
+ type: winogrande
2568
+ config: winogrande_xl
2569
+ split: validation
2570
+ args:
2571
+ num_few_shot: 5
2572
+ metrics:
2573
+ - type: acc
2574
+ value: 78.22
2575
+ name: accuracy
2576
+ source:
2577
+ url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=GritLM/GritLM-7B
2578
+ name: Open LLM Leaderboard
2579
+ - task:
2580
+ type: text-generation
2581
+ name: Text Generation
2582
+ dataset:
2583
+ name: GSM8k (5-shot)
2584
+ type: gsm8k
2585
+ config: main
2586
+ split: test
2587
+ args:
2588
+ num_few_shot: 5
2589
+ metrics:
2590
+ - type: acc
2591
+ value: 45.03
2592
+ name: accuracy
2593
+ source:
2594
+ url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=GritLM/GritLM-7B
2595
+ name: Open LLM Leaderboard
2596
  ---
2597
 
2598
  # Model Summary
 
2624
  archivePrefix={arXiv},
2625
  primaryClass={cs.CL}
2626
  }
2627
+ ```
2628
+ # [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
2629
+ Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_GritLM__GritLM-7B)
2630
+
2631
+ | Metric |Value|
2632
+ |---------------------------------|----:|
2633
+ |Avg. |61.41|
2634
+ |AI2 Reasoning Challenge (25-Shot)|58.11|
2635
+ |HellaSwag (10-Shot) |80.97|
2636
+ |MMLU (5-Shot) |60.29|
2637
+ |TruthfulQA (0-shot) |45.86|
2638
+ |Winogrande (5-shot) |78.22|
2639
+ |GSM8k (5-shot) |45.03|
2640
+