towing committed on
Commit
0d65e60
·
verified ·
1 Parent(s): 8db2588

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +1170 -1162
README.md CHANGED
@@ -1,1163 +1,1171 @@
1
- ---
2
- tags:
3
- - mteb
4
- - sentence-similarity
5
- - sentence-transformers
6
- - Sentence Transformers
7
- model-index:
8
- - name: gte-small-zh
9
- results:
10
- - task:
11
- type: STS
12
- dataset:
13
- type: C-MTEB/AFQMC
14
- name: MTEB AFQMC
15
- config: default
16
- split: validation
17
- revision: None
18
- metrics:
19
- - type: cos_sim_pearson
20
- value: 35.80906032378281
21
- - type: cos_sim_spearman
22
- value: 36.688967176174415
23
- - type: euclidean_pearson
24
- value: 35.70701955438158
25
- - type: euclidean_spearman
26
- value: 36.6889470691436
27
- - type: manhattan_pearson
28
- value: 35.832741768286944
29
- - type: manhattan_spearman
30
- value: 36.831888591957195
31
- - task:
32
- type: STS
33
- dataset:
34
- type: C-MTEB/ATEC
35
- name: MTEB ATEC
36
- config: default
37
- split: test
38
- revision: None
39
- metrics:
40
- - type: cos_sim_pearson
41
- value: 44.667266488330384
42
- - type: cos_sim_spearman
43
- value: 45.77390794946174
44
- - type: euclidean_pearson
45
- value: 48.14272832901943
46
- - type: euclidean_spearman
47
- value: 45.77390569666109
48
- - type: manhattan_pearson
49
- value: 48.187667158563094
50
- - type: manhattan_spearman
51
- value: 45.80979161966117
52
- - task:
53
- type: Classification
54
- dataset:
55
- type: mteb/amazon_reviews_multi
56
- name: MTEB AmazonReviewsClassification (zh)
57
- config: zh
58
- split: test
59
- revision: 1399c76144fd37290681b995c656ef9b2e06e26d
60
- metrics:
61
- - type: accuracy
62
- value: 38.690000000000005
63
- - type: f1
64
- value: 36.868257131984016
65
- - task:
66
- type: STS
67
- dataset:
68
- type: C-MTEB/BQ
69
- name: MTEB BQ
70
- config: default
71
- split: test
72
- revision: None
73
- metrics:
74
- - type: cos_sim_pearson
75
- value: 49.03674224607541
76
- - type: cos_sim_spearman
77
- value: 49.63568854885055
78
- - type: euclidean_pearson
79
- value: 49.47441886441355
80
- - type: euclidean_spearman
81
- value: 49.63567815431205
82
- - type: manhattan_pearson
83
- value: 49.76480072909559
84
- - type: manhattan_spearman
85
- value: 49.977789367288224
86
- - task:
87
- type: Clustering
88
- dataset:
89
- type: C-MTEB/CLSClusteringP2P
90
- name: MTEB CLSClusteringP2P
91
- config: default
92
- split: test
93
- revision: None
94
- metrics:
95
- - type: v_measure
96
- value: 39.538126779019755
97
- - task:
98
- type: Clustering
99
- dataset:
100
- type: C-MTEB/CLSClusteringS2S
101
- name: MTEB CLSClusteringS2S
102
- config: default
103
- split: test
104
- revision: None
105
- metrics:
106
- - type: v_measure
107
- value: 37.333105487031766
108
- - task:
109
- type: Reranking
110
- dataset:
111
- type: C-MTEB/CMedQAv1-reranking
112
- name: MTEB CMedQAv1
113
- config: default
114
- split: test
115
- revision: None
116
- metrics:
117
- - type: map
118
- value: 86.08142426347963
119
- - type: mrr
120
- value: 88.04269841269841
121
- - task:
122
- type: Reranking
123
- dataset:
124
- type: C-MTEB/CMedQAv2-reranking
125
- name: MTEB CMedQAv2
126
- config: default
127
- split: test
128
- revision: None
129
- metrics:
130
- - type: map
131
- value: 87.25694119382474
132
- - type: mrr
133
- value: 89.36853174603175
134
- - task:
135
- type: Retrieval
136
- dataset:
137
- type: C-MTEB/CmedqaRetrieval
138
- name: MTEB CmedqaRetrieval
139
- config: default
140
- split: dev
141
- revision: None
142
- metrics:
143
- - type: map_at_1
144
- value: 23.913999999999998
145
- - type: map_at_10
146
- value: 35.913000000000004
147
- - type: map_at_100
148
- value: 37.836
149
- - type: map_at_1000
150
- value: 37.952000000000005
151
- - type: map_at_3
152
- value: 31.845000000000002
153
- - type: map_at_5
154
- value: 34.0
155
- - type: mrr_at_1
156
- value: 36.884
157
- - type: mrr_at_10
158
- value: 44.872
159
- - type: mrr_at_100
160
- value: 45.899
161
- - type: mrr_at_1000
162
- value: 45.945
163
- - type: mrr_at_3
164
- value: 42.331
165
- - type: mrr_at_5
166
- value: 43.674
167
- - type: ndcg_at_1
168
- value: 36.884
169
- - type: ndcg_at_10
170
- value: 42.459
171
- - type: ndcg_at_100
172
- value: 50.046
173
- - type: ndcg_at_1000
174
- value: 52.092000000000006
175
- - type: ndcg_at_3
176
- value: 37.225
177
- - type: ndcg_at_5
178
- value: 39.2
179
- - type: precision_at_1
180
- value: 36.884
181
- - type: precision_at_10
182
- value: 9.562
183
- - type: precision_at_100
184
- value: 1.572
185
- - type: precision_at_1000
186
- value: 0.183
187
- - type: precision_at_3
188
- value: 21.122
189
- - type: precision_at_5
190
- value: 15.274
191
- - type: recall_at_1
192
- value: 23.913999999999998
193
- - type: recall_at_10
194
- value: 52.891999999999996
195
- - type: recall_at_100
196
- value: 84.328
197
- - type: recall_at_1000
198
- value: 98.168
199
- - type: recall_at_3
200
- value: 37.095
201
- - type: recall_at_5
202
- value: 43.396
203
- - task:
204
- type: PairClassification
205
- dataset:
206
- type: C-MTEB/CMNLI
207
- name: MTEB Cmnli
208
- config: default
209
- split: validation
210
- revision: None
211
- metrics:
212
- - type: cos_sim_accuracy
213
- value: 68.91160553217077
214
- - type: cos_sim_ap
215
- value: 76.45769658379533
216
- - type: cos_sim_f1
217
- value: 72.07988702844463
218
- - type: cos_sim_precision
219
- value: 63.384779137839274
220
- - type: cos_sim_recall
221
- value: 83.53986439092822
222
- - type: dot_accuracy
223
- value: 68.91160553217077
224
- - type: dot_ap
225
- value: 76.47279917239219
226
- - type: dot_f1
227
- value: 72.07988702844463
228
- - type: dot_precision
229
- value: 63.384779137839274
230
- - type: dot_recall
231
- value: 83.53986439092822
232
- - type: euclidean_accuracy
233
- value: 68.91160553217077
234
- - type: euclidean_ap
235
- value: 76.45768544225383
236
- - type: euclidean_f1
237
- value: 72.07988702844463
238
- - type: euclidean_precision
239
- value: 63.384779137839274
240
- - type: euclidean_recall
241
- value: 83.53986439092822
242
- - type: manhattan_accuracy
243
- value: 69.21226698737222
244
- - type: manhattan_ap
245
- value: 76.6623683693766
246
- - type: manhattan_f1
247
- value: 72.14058164628506
248
- - type: manhattan_precision
249
- value: 64.35643564356435
250
- - type: manhattan_recall
251
- value: 82.06686930091185
252
- - type: max_accuracy
253
- value: 69.21226698737222
254
- - type: max_ap
255
- value: 76.6623683693766
256
- - type: max_f1
257
- value: 72.14058164628506
258
- - task:
259
- type: Retrieval
260
- dataset:
261
- type: C-MTEB/CovidRetrieval
262
- name: MTEB CovidRetrieval
263
- config: default
264
- split: dev
265
- revision: None
266
- metrics:
267
- - type: map_at_1
268
- value: 48.419000000000004
269
- - type: map_at_10
270
- value: 57.367999999999995
271
- - type: map_at_100
272
- value: 58.081
273
- - type: map_at_1000
274
- value: 58.108000000000004
275
- - type: map_at_3
276
- value: 55.251
277
- - type: map_at_5
278
- value: 56.53399999999999
279
- - type: mrr_at_1
280
- value: 48.472
281
- - type: mrr_at_10
282
- value: 57.359
283
- - type: mrr_at_100
284
- value: 58.055
285
- - type: mrr_at_1000
286
- value: 58.082
287
- - type: mrr_at_3
288
- value: 55.303999999999995
289
- - type: mrr_at_5
290
- value: 56.542
291
- - type: ndcg_at_1
292
- value: 48.472
293
- - type: ndcg_at_10
294
- value: 61.651999999999994
295
- - type: ndcg_at_100
296
- value: 65.257
297
- - type: ndcg_at_1000
298
- value: 65.977
299
- - type: ndcg_at_3
300
- value: 57.401
301
- - type: ndcg_at_5
302
- value: 59.681
303
- - type: precision_at_1
304
- value: 48.472
305
- - type: precision_at_10
306
- value: 7.576
307
- - type: precision_at_100
308
- value: 0.932
309
- - type: precision_at_1000
310
- value: 0.099
311
- - type: precision_at_3
312
- value: 21.25
313
- - type: precision_at_5
314
- value: 13.888
315
- - type: recall_at_1
316
- value: 48.419000000000004
317
- - type: recall_at_10
318
- value: 74.97399999999999
319
- - type: recall_at_100
320
- value: 92.202
321
- - type: recall_at_1000
322
- value: 97.893
323
- - type: recall_at_3
324
- value: 63.541000000000004
325
- - type: recall_at_5
326
- value: 68.994
327
- - task:
328
- type: Retrieval
329
- dataset:
330
- type: C-MTEB/DuRetrieval
331
- name: MTEB DuRetrieval
332
- config: default
333
- split: dev
334
- revision: None
335
- metrics:
336
- - type: map_at_1
337
- value: 22.328
338
- - type: map_at_10
339
- value: 69.11
340
- - type: map_at_100
341
- value: 72.47
342
- - type: map_at_1000
343
- value: 72.54599999999999
344
- - type: map_at_3
345
- value: 46.938
346
- - type: map_at_5
347
- value: 59.56
348
- - type: mrr_at_1
349
- value: 81.35
350
- - type: mrr_at_10
351
- value: 87.066
352
- - type: mrr_at_100
353
- value: 87.212
354
- - type: mrr_at_1000
355
- value: 87.21799999999999
356
- - type: mrr_at_3
357
- value: 86.558
358
- - type: mrr_at_5
359
- value: 86.931
360
- - type: ndcg_at_1
361
- value: 81.35
362
- - type: ndcg_at_10
363
- value: 78.568
364
- - type: ndcg_at_100
365
- value: 82.86099999999999
366
- - type: ndcg_at_1000
367
- value: 83.628
368
- - type: ndcg_at_3
369
- value: 76.716
370
- - type: ndcg_at_5
371
- value: 75.664
372
- - type: precision_at_1
373
- value: 81.35
374
- - type: precision_at_10
375
- value: 38.545
376
- - type: precision_at_100
377
- value: 4.657
378
- - type: precision_at_1000
379
- value: 0.484
380
- - type: precision_at_3
381
- value: 69.18299999999999
382
- - type: precision_at_5
383
- value: 58.67
384
- - type: recall_at_1
385
- value: 22.328
386
- - type: recall_at_10
387
- value: 80.658
388
- - type: recall_at_100
389
- value: 94.093
390
- - type: recall_at_1000
391
- value: 98.137
392
- - type: recall_at_3
393
- value: 50.260000000000005
394
- - type: recall_at_5
395
- value: 66.045
396
- - task:
397
- type: Retrieval
398
- dataset:
399
- type: C-MTEB/EcomRetrieval
400
- name: MTEB EcomRetrieval
401
- config: default
402
- split: dev
403
- revision: None
404
- metrics:
405
- - type: map_at_1
406
- value: 43.1
407
- - type: map_at_10
408
- value: 52.872
409
- - type: map_at_100
410
- value: 53.556000000000004
411
- - type: map_at_1000
412
- value: 53.583000000000006
413
- - type: map_at_3
414
- value: 50.14999999999999
415
- - type: map_at_5
416
- value: 51.925
417
- - type: mrr_at_1
418
- value: 43.1
419
- - type: mrr_at_10
420
- value: 52.872
421
- - type: mrr_at_100
422
- value: 53.556000000000004
423
- - type: mrr_at_1000
424
- value: 53.583000000000006
425
- - type: mrr_at_3
426
- value: 50.14999999999999
427
- - type: mrr_at_5
428
- value: 51.925
429
- - type: ndcg_at_1
430
- value: 43.1
431
- - type: ndcg_at_10
432
- value: 57.907
433
- - type: ndcg_at_100
434
- value: 61.517999999999994
435
- - type: ndcg_at_1000
436
- value: 62.175000000000004
437
- - type: ndcg_at_3
438
- value: 52.425
439
- - type: ndcg_at_5
440
- value: 55.631
441
- - type: precision_at_1
442
- value: 43.1
443
- - type: precision_at_10
444
- value: 7.380000000000001
445
- - type: precision_at_100
446
- value: 0.9129999999999999
447
- - type: precision_at_1000
448
- value: 0.096
449
- - type: precision_at_3
450
- value: 19.667
451
- - type: precision_at_5
452
- value: 13.36
453
- - type: recall_at_1
454
- value: 43.1
455
- - type: recall_at_10
456
- value: 73.8
457
- - type: recall_at_100
458
- value: 91.3
459
- - type: recall_at_1000
460
- value: 96.39999999999999
461
- - type: recall_at_3
462
- value: 59.0
463
- - type: recall_at_5
464
- value: 66.8
465
- - task:
466
- type: Classification
467
- dataset:
468
- type: C-MTEB/IFlyTek-classification
469
- name: MTEB IFlyTek
470
- config: default
471
- split: validation
472
- revision: None
473
- metrics:
474
- - type: accuracy
475
- value: 41.146594844170835
476
- - type: f1
477
- value: 28.544218732704845
478
- - task:
479
- type: Classification
480
- dataset:
481
- type: C-MTEB/JDReview-classification
482
- name: MTEB JDReview
483
- config: default
484
- split: test
485
- revision: None
486
- metrics:
487
- - type: accuracy
488
- value: 82.83302063789868
489
- - type: ap
490
- value: 48.881798834997056
491
- - type: f1
492
- value: 77.28655923994657
493
- - task:
494
- type: STS
495
- dataset:
496
- type: C-MTEB/LCQMC
497
- name: MTEB LCQMC
498
- config: default
499
- split: test
500
- revision: None
501
- metrics:
502
- - type: cos_sim_pearson
503
- value: 66.05467125345538
504
- - type: cos_sim_spearman
505
- value: 72.71921060562211
506
- - type: euclidean_pearson
507
- value: 71.28539457113986
508
- - type: euclidean_spearman
509
- value: 72.71920173126693
510
- - type: manhattan_pearson
511
- value: 71.23750818174456
512
- - type: manhattan_spearman
513
- value: 72.61025268693467
514
- - task:
515
- type: Reranking
516
- dataset:
517
- type: C-MTEB/Mmarco-reranking
518
- name: MTEB MMarcoReranking
519
- config: default
520
- split: dev
521
- revision: None
522
- metrics:
523
- - type: map
524
- value: 26.127712982639483
525
- - type: mrr
526
- value: 24.87420634920635
527
- - task:
528
- type: Retrieval
529
- dataset:
530
- type: C-MTEB/MMarcoRetrieval
531
- name: MTEB MMarcoRetrieval
532
- config: default
533
- split: dev
534
- revision: None
535
- metrics:
536
- - type: map_at_1
537
- value: 62.517
538
- - type: map_at_10
539
- value: 71.251
540
- - type: map_at_100
541
- value: 71.647
542
- - type: map_at_1000
543
- value: 71.665
544
- - type: map_at_3
545
- value: 69.28
546
- - type: map_at_5
547
- value: 70.489
548
- - type: mrr_at_1
549
- value: 64.613
550
- - type: mrr_at_10
551
- value: 71.89
552
- - type: mrr_at_100
553
- value: 72.243
554
- - type: mrr_at_1000
555
- value: 72.259
556
- - type: mrr_at_3
557
- value: 70.138
558
- - type: mrr_at_5
559
- value: 71.232
560
- - type: ndcg_at_1
561
- value: 64.613
562
- - type: ndcg_at_10
563
- value: 75.005
564
- - type: ndcg_at_100
565
- value: 76.805
566
- - type: ndcg_at_1000
567
- value: 77.281
568
- - type: ndcg_at_3
569
- value: 71.234
570
- - type: ndcg_at_5
571
- value: 73.294
572
- - type: precision_at_1
573
- value: 64.613
574
- - type: precision_at_10
575
- value: 9.142
576
- - type: precision_at_100
577
- value: 1.004
578
- - type: precision_at_1000
579
- value: 0.104
580
- - type: precision_at_3
581
- value: 26.781
582
- - type: precision_at_5
583
- value: 17.149
584
- - type: recall_at_1
585
- value: 62.517
586
- - type: recall_at_10
587
- value: 85.997
588
- - type: recall_at_100
589
- value: 94.18299999999999
590
- - type: recall_at_1000
591
- value: 97.911
592
- - type: recall_at_3
593
- value: 75.993
594
- - type: recall_at_5
595
- value: 80.88300000000001
596
- - task:
597
- type: Classification
598
- dataset:
599
- type: mteb/amazon_massive_intent
600
- name: MTEB MassiveIntentClassification (zh-CN)
601
- config: zh-CN
602
- split: test
603
- revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
604
- metrics:
605
- - type: accuracy
606
- value: 59.27706792199058
607
- - type: f1
608
- value: 56.77545011902468
609
- - task:
610
- type: Classification
611
- dataset:
612
- type: mteb/amazon_massive_scenario
613
- name: MTEB MassiveScenarioClassification (zh-CN)
614
- config: zh-CN
615
- split: test
616
- revision: 7d571f92784cd94a019292a1f45445077d0ef634
617
- metrics:
618
- - type: accuracy
619
- value: 66.47948890383321
620
- - type: f1
621
- value: 66.4502180376861
622
- - task:
623
- type: Retrieval
624
- dataset:
625
- type: C-MTEB/MedicalRetrieval
626
- name: MTEB MedicalRetrieval
627
- config: default
628
- split: dev
629
- revision: None
630
- metrics:
631
- - type: map_at_1
632
- value: 54.2
633
- - type: map_at_10
634
- value: 59.858
635
- - type: map_at_100
636
- value: 60.46
637
- - type: map_at_1000
638
- value: 60.507
639
- - type: map_at_3
640
- value: 58.416999999999994
641
- - type: map_at_5
642
- value: 59.331999999999994
643
- - type: mrr_at_1
644
- value: 54.2
645
- - type: mrr_at_10
646
- value: 59.862
647
- - type: mrr_at_100
648
- value: 60.463
649
- - type: mrr_at_1000
650
- value: 60.51
651
- - type: mrr_at_3
652
- value: 58.416999999999994
653
- - type: mrr_at_5
654
- value: 59.352000000000004
655
- - type: ndcg_at_1
656
- value: 54.2
657
- - type: ndcg_at_10
658
- value: 62.643
659
- - type: ndcg_at_100
660
- value: 65.731
661
- - type: ndcg_at_1000
662
- value: 67.096
663
- - type: ndcg_at_3
664
- value: 59.727
665
- - type: ndcg_at_5
666
- value: 61.375
667
- - type: precision_at_1
668
- value: 54.2
669
- - type: precision_at_10
670
- value: 7.140000000000001
671
- - type: precision_at_100
672
- value: 0.8619999999999999
673
- - type: precision_at_1000
674
- value: 0.097
675
- - type: precision_at_3
676
- value: 21.166999999999998
677
- - type: precision_at_5
678
- value: 13.5
679
- - type: recall_at_1
680
- value: 54.2
681
- - type: recall_at_10
682
- value: 71.39999999999999
683
- - type: recall_at_100
684
- value: 86.2
685
- - type: recall_at_1000
686
- value: 97.2
687
- - type: recall_at_3
688
- value: 63.5
689
- - type: recall_at_5
690
- value: 67.5
691
- - task:
692
- type: Classification
693
- dataset:
694
- type: C-MTEB/MultilingualSentiment-classification
695
- name: MTEB MultilingualSentiment
696
- config: default
697
- split: validation
698
- revision: None
699
- metrics:
700
- - type: accuracy
701
- value: 68.19666666666666
702
- - type: f1
703
- value: 67.58581661416034
704
- - task:
705
- type: PairClassification
706
- dataset:
707
- type: C-MTEB/OCNLI
708
- name: MTEB Ocnli
709
- config: default
710
- split: validation
711
- revision: None
712
- metrics:
713
- - type: cos_sim_accuracy
714
- value: 60.530590146182995
715
- - type: cos_sim_ap
716
- value: 63.53656091243922
717
- - type: cos_sim_f1
718
- value: 68.09929603556874
719
- - type: cos_sim_precision
720
- value: 52.45433789954338
721
- - type: cos_sim_recall
722
- value: 97.04329461457233
723
- - type: dot_accuracy
724
- value: 60.530590146182995
725
- - type: dot_ap
726
- value: 63.53660452157237
727
- - type: dot_f1
728
- value: 68.09929603556874
729
- - type: dot_precision
730
- value: 52.45433789954338
731
- - type: dot_recall
732
- value: 97.04329461457233
733
- - type: euclidean_accuracy
734
- value: 60.530590146182995
735
- - type: euclidean_ap
736
- value: 63.53678735855631
737
- - type: euclidean_f1
738
- value: 68.09929603556874
739
- - type: euclidean_precision
740
- value: 52.45433789954338
741
- - type: euclidean_recall
742
- value: 97.04329461457233
743
- - type: manhattan_accuracy
744
- value: 60.47644829453167
745
- - type: manhattan_ap
746
- value: 63.5622508250315
747
- - type: manhattan_f1
748
- value: 68.1650700073692
749
- - type: manhattan_precision
750
- value: 52.34861346915677
751
- - type: manhattan_recall
752
- value: 97.67687434002113
753
- - type: max_accuracy
754
- value: 60.530590146182995
755
- - type: max_ap
756
- value: 63.5622508250315
757
- - type: max_f1
758
- value: 68.1650700073692
759
- - task:
760
- type: Classification
761
- dataset:
762
- type: C-MTEB/OnlineShopping-classification
763
- name: MTEB OnlineShopping
764
- config: default
765
- split: test
766
- revision: None
767
- metrics:
768
- - type: accuracy
769
- value: 89.13
770
- - type: ap
771
- value: 87.21879260137172
772
- - type: f1
773
- value: 89.12359325300508
774
- - task:
775
- type: STS
776
- dataset:
777
- type: C-MTEB/PAWSX
778
- name: MTEB PAWSX
779
- config: default
780
- split: test
781
- revision: None
782
- metrics:
783
- - type: cos_sim_pearson
784
- value: 12.035577637900758
785
- - type: cos_sim_spearman
786
- value: 12.76524190663864
787
- - type: euclidean_pearson
788
- value: 14.4012689427106
789
- - type: euclidean_spearman
790
- value: 12.765328992583608
791
- - type: manhattan_pearson
792
- value: 14.458505202938946
793
- - type: manhattan_spearman
794
- value: 12.763238700117896
795
- - task:
796
- type: STS
797
- dataset:
798
- type: C-MTEB/QBQTC
799
- name: MTEB QBQTC
800
- config: default
801
- split: test
802
- revision: None
803
- metrics:
804
- - type: cos_sim_pearson
805
- value: 34.809415339934006
806
- - type: cos_sim_spearman
807
- value: 36.96728615916954
808
- - type: euclidean_pearson
809
- value: 35.56113673772396
810
- - type: euclidean_spearman
811
- value: 36.96842963389308
812
- - type: manhattan_pearson
813
- value: 35.5447066178264
814
- - type: manhattan_spearman
815
- value: 36.97514513480951
816
- - task:
817
- type: STS
818
- dataset:
819
- type: mteb/sts22-crosslingual-sts
820
- name: MTEB STS22 (zh)
821
- config: zh
822
- split: test
823
- revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
824
- metrics:
825
- - type: cos_sim_pearson
826
- value: 66.39448692338551
827
- - type: cos_sim_spearman
828
- value: 66.72211526923901
829
- - type: euclidean_pearson
830
- value: 65.72981824553035
831
- - type: euclidean_spearman
832
- value: 66.72211526923901
833
- - type: manhattan_pearson
834
- value: 65.52315559414296
835
- - type: manhattan_spearman
836
- value: 66.61931702511545
837
- - task:
838
- type: STS
839
- dataset:
840
- type: C-MTEB/STSB
841
- name: MTEB STSB
842
- config: default
843
- split: test
844
- revision: None
845
- metrics:
846
- - type: cos_sim_pearson
847
- value: 76.73608064460915
848
- - type: cos_sim_spearman
849
- value: 76.51424826130031
850
- - type: euclidean_pearson
851
- value: 76.17930213372487
852
- - type: euclidean_spearman
853
- value: 76.51342756283478
854
- - type: manhattan_pearson
855
- value: 75.87085607319342
856
- - type: manhattan_spearman
857
- value: 76.22676341477134
858
- - task:
859
- type: Reranking
860
- dataset:
861
- type: C-MTEB/T2Reranking
862
- name: MTEB T2Reranking
863
- config: default
864
- split: dev
865
- revision: None
866
- metrics:
867
- - type: map
868
- value: 65.38779931543048
869
- - type: mrr
870
- value: 74.79313763420059
871
- - task:
872
- type: Retrieval
873
- dataset:
874
- type: C-MTEB/T2Retrieval
875
- name: MTEB T2Retrieval
876
- config: default
877
- split: dev
878
- revision: None
879
- metrics:
880
- - type: map_at_1
881
- value: 25.131999999999998
882
- - type: map_at_10
883
- value: 69.131
884
- - type: map_at_100
885
- value: 72.943
886
- - type: map_at_1000
887
- value: 73.045
888
- - type: map_at_3
889
- value: 48.847
890
- - type: map_at_5
891
- value: 59.842
892
- - type: mrr_at_1
893
- value: 85.516
894
- - type: mrr_at_10
895
- value: 88.863
896
- - type: mrr_at_100
897
- value: 88.996
898
- - type: mrr_at_1000
899
- value: 89.00099999999999
900
- - type: mrr_at_3
901
- value: 88.277
902
- - type: mrr_at_5
903
- value: 88.64800000000001
904
- - type: ndcg_at_1
905
- value: 85.516
906
- - type: ndcg_at_10
907
- value: 78.122
908
- - type: ndcg_at_100
909
- value: 82.673
910
- - type: ndcg_at_1000
911
- value: 83.707
912
- - type: ndcg_at_3
913
- value: 80.274
914
- - type: ndcg_at_5
915
- value: 78.405
916
- - type: precision_at_1
917
- value: 85.516
918
- - type: precision_at_10
919
- value: 38.975
920
- - type: precision_at_100
921
- value: 4.833
922
- - type: precision_at_1000
923
- value: 0.509
924
- - type: precision_at_3
925
- value: 70.35
926
- - type: precision_at_5
927
- value: 58.638
928
- - type: recall_at_1
929
- value: 25.131999999999998
930
- - type: recall_at_10
931
- value: 76.848
932
- - type: recall_at_100
933
- value: 91.489
934
- - type: recall_at_1000
935
- value: 96.709
936
- - type: recall_at_3
937
- value: 50.824000000000005
938
- - type: recall_at_5
939
- value: 63.89
940
- - task:
941
- type: Classification
942
- dataset:
943
- type: C-MTEB/TNews-classification
944
- name: MTEB TNews
945
- config: default
946
- split: validation
947
- revision: None
948
- metrics:
949
- - type: accuracy
950
- value: 49.65
951
- - type: f1
952
- value: 47.66791473245483
953
- - task:
954
- type: Clustering
955
- dataset:
956
- type: C-MTEB/ThuNewsClusteringP2P
957
- name: MTEB ThuNewsClusteringP2P
958
- config: default
959
- split: test
960
- revision: None
961
- metrics:
962
- - type: v_measure
963
- value: 63.78843565968542
964
- - task:
965
- type: Clustering
966
- dataset:
967
- type: C-MTEB/ThuNewsClusteringS2S
968
- name: MTEB ThuNewsClusteringS2S
969
- config: default
970
- split: test
971
- revision: None
972
- metrics:
973
- - type: v_measure
974
- value: 55.14095244943176
975
- - task:
976
- type: Retrieval
977
- dataset:
978
- type: C-MTEB/VideoRetrieval
979
- name: MTEB VideoRetrieval
980
- config: default
981
- split: dev
982
- revision: None
983
- metrics:
984
- - type: map_at_1
985
- value: 53.800000000000004
986
- - type: map_at_10
987
- value: 63.312000000000005
988
- - type: map_at_100
989
- value: 63.93600000000001
990
- - type: map_at_1000
991
- value: 63.955
992
- - type: map_at_3
993
- value: 61.283
994
- - type: map_at_5
995
- value: 62.553000000000004
996
- - type: mrr_at_1
997
- value: 53.800000000000004
998
- - type: mrr_at_10
999
- value: 63.312000000000005
1000
- - type: mrr_at_100
1001
- value: 63.93600000000001
1002
- - type: mrr_at_1000
1003
- value: 63.955
1004
- - type: mrr_at_3
1005
- value: 61.283
1006
- - type: mrr_at_5
1007
- value: 62.553000000000004
1008
- - type: ndcg_at_1
1009
- value: 53.800000000000004
1010
- - type: ndcg_at_10
1011
- value: 67.693
1012
- - type: ndcg_at_100
1013
- value: 70.552
1014
- - type: ndcg_at_1000
1015
- value: 71.06099999999999
1016
- - type: ndcg_at_3
1017
- value: 63.632
1018
- - type: ndcg_at_5
1019
- value: 65.90899999999999
1020
- - type: precision_at_1
1021
- value: 53.800000000000004
1022
- - type: precision_at_10
1023
- value: 8.129999999999999
1024
- - type: precision_at_100
1025
- value: 0.943
1026
- - type: precision_at_1000
1027
- value: 0.098
1028
- - type: precision_at_3
1029
- value: 23.467
1030
- - type: precision_at_5
1031
- value: 15.18
1032
- - type: recall_at_1
1033
- value: 53.800000000000004
1034
- - type: recall_at_10
1035
- value: 81.3
1036
- - type: recall_at_100
1037
- value: 94.3
1038
- - type: recall_at_1000
1039
- value: 98.3
1040
- - type: recall_at_3
1041
- value: 70.39999999999999
1042
- - type: recall_at_5
1043
- value: 75.9
1044
- - task:
1045
- type: Classification
1046
- dataset:
1047
- type: C-MTEB/waimai-classification
1048
- name: MTEB Waimai
1049
- config: default
1050
- split: test
1051
- revision: None
1052
- metrics:
1053
- - type: accuracy
1054
- value: 84.96000000000001
1055
- - type: ap
1056
- value: 66.89917287702019
1057
- - type: f1
1058
- value: 83.0239988458119
1059
- language:
1060
- - en
1061
- license: mit
1062
- ---
1063
-
1064
- # gte-small-zh
1065
-
1066
- General Text Embeddings (GTE) model. [Towards General Text Embeddings with Multi-stage Contrastive Learning](https://arxiv.org/abs/2308.03281)
1067
-
1068
- The GTE models are trained by Alibaba DAMO Academy. They are mainly based on the BERT framework and are currently offered in different sizes for both Chinese and English. The GTE models are trained on a large-scale corpus of relevance text pairs, covering a wide range of domains and scenarios. This enables the GTE models to be applied to various downstream text-embedding tasks, including **information retrieval**, **semantic textual similarity**, **text reranking**, etc.
1069
-
1070
- ## Model List
1071
-
1072
- | Models | Language | Max Sequence Length | Dimension | Model Size |
1073
- |:-----: | :-----: |:-----: |:-----: |:-----: |
1074
- |[GTE-large-zh](https://huggingface.co/thenlper/gte-large-zh) | Chinese | 512 | 1024 | 0.67GB |
1075
- |[GTE-base-zh](https://huggingface.co/thenlper/gte-base-zh) | Chinese | 512 | 768 | 0.21GB |
1076
- |[GTE-small-zh](https://huggingface.co/thenlper/gte-small-zh) | Chinese | 512 | 512 | 0.10GB |
1077
- |[GTE-large](https://huggingface.co/thenlper/gte-large) | English | 512 | 1024 | 0.67GB |
1078
- |[GTE-base](https://huggingface.co/thenlper/gte-base) | English | 512 | 512 | 0.21GB |
1079
- |[GTE-small](https://huggingface.co/thenlper/gte-small) | English | 512 | 384 | 0.10GB |
1080
-
1081
- ## Metrics
1082
-
1083
- We compared the performance of the GTE models with other popular text embedding models on the MTEB (CMTEB for Chinese language) benchmark. For more detailed comparison results, please refer to the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
1084
-
1085
- - Evaluation results on CMTEB
1086
-
1087
- | Model | Model Size (GB) | Embedding Dimensions | Sequence Length | Average (35 datasets) | Classification (9 datasets) | Clustering (4 datasets) | Pair Classification (2 datasets) | Reranking (4 datasets) | Retrieval (8 datasets) | STS (8 datasets) |
1088
- | ------------------- | -------------- | -------------------- | ---------------- | --------------------- | ------------------------------------ | ------------------------------ | --------------------------------------- | ------------------------------ | ---------------------------- | ------------------------ |
1089
- | **gte-large-zh** | 0.65 | 1024 | 512 | **66.72** | 71.34 | 53.07 | 81.14 | 67.42 | 72.49 | 57.82 |
1090
- | gte-base-zh | 0.20 | 768 | 512 | 65.92 | 71.26 | 53.86 | 80.44 | 67.00 | 71.71 | 55.96 |
1091
- | stella-large-zh-v2 | 0.65 | 1024 | 1024 | 65.13 | 69.05 | 49.16 | 82.68 | 66.41 | 70.14 | 58.66 |
1092
- | stella-large-zh | 0.65 | 1024 | 1024 | 64.54 | 67.62 | 48.65 | 78.72 | 65.98 | 71.02 | 58.3 |
1093
- | bge-large-zh-v1.5 | 1.3 | 1024 | 512 | 64.53 | 69.13 | 48.99 | 81.6 | 65.84 | 70.46 | 56.25 |
1094
- | stella-base-zh-v2 | 0.21 | 768 | 1024 | 64.36 | 68.29 | 49.4 | 79.96 | 66.1 | 70.08 | 56.92 |
1095
- | stella-base-zh | 0.21 | 768 | 1024 | 64.16 | 67.77 | 48.7 | 76.09 | 66.95 | 71.07 | 56.54 |
1096
- | piccolo-large-zh | 0.65 | 1024 | 512 | 64.11 | 67.03 | 47.04 | 78.38 | 65.98 | 70.93 | 58.02 |
1097
- | piccolo-base-zh | 0.2 | 768 | 512 | 63.66 | 66.98 | 47.12 | 76.61 | 66.68 | 71.2 | 55.9 |
1098
- | gte-small-zh | 0.1 | 512 | 512 | 60.04 | 64.35 | 48.95 | 69.99 | 66.21 | 65.50 | 49.72 |
1099
- | bge-small-zh-v1.5 | 0.1 | 512 | 512 | 57.82 | 63.96 | 44.18 | 70.4 | 60.92 | 61.77 | 49.1 |
1100
- | m3e-base | 0.41 | 768 | 512 | 57.79 | 67.52 | 47.68 | 63.99 | 59.54| 56.91 | 50.47 |
1101
- |text-embedding-ada-002(openai) | - | 1536| 8192 | 53.02 | 64.31 | 45.68 | 69.56 | 54.28 | 52.0 | 43.35 |
1102
-
1103
-
1104
- ## Usage
1105
-
1106
- Code example
1107
-
1108
- ```python
1109
- import torch.nn.functional as F
1110
- from torch import Tensor
1111
- from transformers import AutoTokenizer, AutoModel
1112
-
1113
- input_texts = [
1114
- "中国的首都是哪里",
1115
- "你喜欢去哪里旅游",
1116
- "北京",
1117
- "今天中午吃什么"
1118
- ]
1119
-
1120
- tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small-zh")
1121
- model = AutoModel.from_pretrained("thenlper/gte-small-zh")
1122
-
1123
- # Tokenize the input texts
1124
- batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
1125
-
1126
- outputs = model(**batch_dict)
1127
- embeddings = outputs.last_hidden_state[:, 0]
1128
-
1129
- # (Optionally) normalize embeddings
1130
- embeddings = F.normalize(embeddings, p=2, dim=1)
1131
- scores = (embeddings[:1] @ embeddings[1:].T) * 100
1132
- print(scores.tolist())
1133
- ```
1134
-
1135
- Use with sentence-transformers:
1136
-
1137
- ```python
1138
- from sentence_transformers import SentenceTransformer
1139
- from sentence_transformers.util import cos_sim
1140
-
1141
- sentences = ['That is a happy person', 'That is a very happy person']
1142
-
1143
- model = SentenceTransformer('thenlper/gte-small-zh')
1144
- embeddings = model.encode(sentences)
1145
- print(cos_sim(embeddings[0], embeddings[1]))
1146
- ```
1147
-
1148
- ### Limitation
1149
-
1150
- This model supports Chinese text only, and any input longer than 512 tokens is truncated to 512 tokens.
1151
-
1152
- ### Citation
1153
-
1154
- If you find our paper or models helpful, please consider citing them as follows:
1155
-
1156
- ```
1157
- @article{li2023towards,
1158
- title={Towards general text embeddings with multi-stage contrastive learning},
1159
- author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
1160
- journal={arXiv preprint arXiv:2308.03281},
1161
- year={2023}
1162
- }
 
 
 
 
 
 
 
 
1163
  ```
 
1
+ ---
2
+ tags:
3
+ - mteb
4
+ - sentence-similarity
5
+ - sentence-transformers
6
+ - Sentence Transformers
7
+ model-index:
8
+ - name: gte-small-zh
9
+ results:
10
+ - task:
11
+ type: STS
12
+ dataset:
13
+ type: C-MTEB/AFQMC
14
+ name: MTEB AFQMC
15
+ config: default
16
+ split: validation
17
+ revision: None
18
+ metrics:
19
+ - type: cos_sim_pearson
20
+ value: 35.80906032378281
21
+ - type: cos_sim_spearman
22
+ value: 36.688967176174415
23
+ - type: euclidean_pearson
24
+ value: 35.70701955438158
25
+ - type: euclidean_spearman
26
+ value: 36.6889470691436
27
+ - type: manhattan_pearson
28
+ value: 35.832741768286944
29
+ - type: manhattan_spearman
30
+ value: 36.831888591957195
31
+ - task:
32
+ type: STS
33
+ dataset:
34
+ type: C-MTEB/ATEC
35
+ name: MTEB ATEC
36
+ config: default
37
+ split: test
38
+ revision: None
39
+ metrics:
40
+ - type: cos_sim_pearson
41
+ value: 44.667266488330384
42
+ - type: cos_sim_spearman
43
+ value: 45.77390794946174
44
+ - type: euclidean_pearson
45
+ value: 48.14272832901943
46
+ - type: euclidean_spearman
47
+ value: 45.77390569666109
48
+ - type: manhattan_pearson
49
+ value: 48.187667158563094
50
+ - type: manhattan_spearman
51
+ value: 45.80979161966117
52
+ - task:
53
+ type: Classification
54
+ dataset:
55
+ type: mteb/amazon_reviews_multi
56
+ name: MTEB AmazonReviewsClassification (zh)
57
+ config: zh
58
+ split: test
59
+ revision: 1399c76144fd37290681b995c656ef9b2e06e26d
60
+ metrics:
61
+ - type: accuracy
62
+ value: 38.690000000000005
63
+ - type: f1
64
+ value: 36.868257131984016
65
+ - task:
66
+ type: STS
67
+ dataset:
68
+ type: C-MTEB/BQ
69
+ name: MTEB BQ
70
+ config: default
71
+ split: test
72
+ revision: None
73
+ metrics:
74
+ - type: cos_sim_pearson
75
+ value: 49.03674224607541
76
+ - type: cos_sim_spearman
77
+ value: 49.63568854885055
78
+ - type: euclidean_pearson
79
+ value: 49.47441886441355
80
+ - type: euclidean_spearman
81
+ value: 49.63567815431205
82
+ - type: manhattan_pearson
83
+ value: 49.76480072909559
84
+ - type: manhattan_spearman
85
+ value: 49.977789367288224
86
+ - task:
87
+ type: Clustering
88
+ dataset:
89
+ type: C-MTEB/CLSClusteringP2P
90
+ name: MTEB CLSClusteringP2P
91
+ config: default
92
+ split: test
93
+ revision: None
94
+ metrics:
95
+ - type: v_measure
96
+ value: 39.538126779019755
97
+ - task:
98
+ type: Clustering
99
+ dataset:
100
+ type: C-MTEB/CLSClusteringS2S
101
+ name: MTEB CLSClusteringS2S
102
+ config: default
103
+ split: test
104
+ revision: None
105
+ metrics:
106
+ - type: v_measure
107
+ value: 37.333105487031766
108
+ - task:
109
+ type: Reranking
110
+ dataset:
111
+ type: C-MTEB/CMedQAv1-reranking
112
+ name: MTEB CMedQAv1
113
+ config: default
114
+ split: test
115
+ revision: None
116
+ metrics:
117
+ - type: map
118
+ value: 86.08142426347963
119
+ - type: mrr
120
+ value: 88.04269841269841
121
+ - task:
122
+ type: Reranking
123
+ dataset:
124
+ type: C-MTEB/CMedQAv2-reranking
125
+ name: MTEB CMedQAv2
126
+ config: default
127
+ split: test
128
+ revision: None
129
+ metrics:
130
+ - type: map
131
+ value: 87.25694119382474
132
+ - type: mrr
133
+ value: 89.36853174603175
134
+ - task:
135
+ type: Retrieval
136
+ dataset:
137
+ type: C-MTEB/CmedqaRetrieval
138
+ name: MTEB CmedqaRetrieval
139
+ config: default
140
+ split: dev
141
+ revision: None
142
+ metrics:
143
+ - type: map_at_1
144
+ value: 23.913999999999998
145
+ - type: map_at_10
146
+ value: 35.913000000000004
147
+ - type: map_at_100
148
+ value: 37.836
149
+ - type: map_at_1000
150
+ value: 37.952000000000005
151
+ - type: map_at_3
152
+ value: 31.845000000000002
153
+ - type: map_at_5
154
+ value: 34.0
155
+ - type: mrr_at_1
156
+ value: 36.884
157
+ - type: mrr_at_10
158
+ value: 44.872
159
+ - type: mrr_at_100
160
+ value: 45.899
161
+ - type: mrr_at_1000
162
+ value: 45.945
163
+ - type: mrr_at_3
164
+ value: 42.331
165
+ - type: mrr_at_5
166
+ value: 43.674
167
+ - type: ndcg_at_1
168
+ value: 36.884
169
+ - type: ndcg_at_10
170
+ value: 42.459
171
+ - type: ndcg_at_100
172
+ value: 50.046
173
+ - type: ndcg_at_1000
174
+ value: 52.092000000000006
175
+ - type: ndcg_at_3
176
+ value: 37.225
177
+ - type: ndcg_at_5
178
+ value: 39.2
179
+ - type: precision_at_1
180
+ value: 36.884
181
+ - type: precision_at_10
182
+ value: 9.562
183
+ - type: precision_at_100
184
+ value: 1.572
185
+ - type: precision_at_1000
186
+ value: 0.183
187
+ - type: precision_at_3
188
+ value: 21.122
189
+ - type: precision_at_5
190
+ value: 15.274
191
+ - type: recall_at_1
192
+ value: 23.913999999999998
193
+ - type: recall_at_10
194
+ value: 52.891999999999996
195
+ - type: recall_at_100
196
+ value: 84.328
197
+ - type: recall_at_1000
198
+ value: 98.168
199
+ - type: recall_at_3
200
+ value: 37.095
201
+ - type: recall_at_5
202
+ value: 43.396
203
+ - task:
204
+ type: PairClassification
205
+ dataset:
206
+ type: C-MTEB/CMNLI
207
+ name: MTEB Cmnli
208
+ config: default
209
+ split: validation
210
+ revision: None
211
+ metrics:
212
+ - type: cos_sim_accuracy
213
+ value: 68.91160553217077
214
+ - type: cos_sim_ap
215
+ value: 76.45769658379533
216
+ - type: cos_sim_f1
217
+ value: 72.07988702844463
218
+ - type: cos_sim_precision
219
+ value: 63.384779137839274
220
+ - type: cos_sim_recall
221
+ value: 83.53986439092822
222
+ - type: dot_accuracy
223
+ value: 68.91160553217077
224
+ - type: dot_ap
225
+ value: 76.47279917239219
226
+ - type: dot_f1
227
+ value: 72.07988702844463
228
+ - type: dot_precision
229
+ value: 63.384779137839274
230
+ - type: dot_recall
231
+ value: 83.53986439092822
232
+ - type: euclidean_accuracy
233
+ value: 68.91160553217077
234
+ - type: euclidean_ap
235
+ value: 76.45768544225383
236
+ - type: euclidean_f1
237
+ value: 72.07988702844463
238
+ - type: euclidean_precision
239
+ value: 63.384779137839274
240
+ - type: euclidean_recall
241
+ value: 83.53986439092822
242
+ - type: manhattan_accuracy
243
+ value: 69.21226698737222
244
+ - type: manhattan_ap
245
+ value: 76.6623683693766
246
+ - type: manhattan_f1
247
+ value: 72.14058164628506
248
+ - type: manhattan_precision
249
+ value: 64.35643564356435
250
+ - type: manhattan_recall
251
+ value: 82.06686930091185
252
+ - type: max_accuracy
253
+ value: 69.21226698737222
254
+ - type: max_ap
255
+ value: 76.6623683693766
256
+ - type: max_f1
257
+ value: 72.14058164628506
258
+ - task:
259
+ type: Retrieval
260
+ dataset:
261
+ type: C-MTEB/CovidRetrieval
262
+ name: MTEB CovidRetrieval
263
+ config: default
264
+ split: dev
265
+ revision: None
266
+ metrics:
267
+ - type: map_at_1
268
+ value: 48.419000000000004
269
+ - type: map_at_10
270
+ value: 57.367999999999995
271
+ - type: map_at_100
272
+ value: 58.081
273
+ - type: map_at_1000
274
+ value: 58.108000000000004
275
+ - type: map_at_3
276
+ value: 55.251
277
+ - type: map_at_5
278
+ value: 56.53399999999999
279
+ - type: mrr_at_1
280
+ value: 48.472
281
+ - type: mrr_at_10
282
+ value: 57.359
283
+ - type: mrr_at_100
284
+ value: 58.055
285
+ - type: mrr_at_1000
286
+ value: 58.082
287
+ - type: mrr_at_3
288
+ value: 55.303999999999995
289
+ - type: mrr_at_5
290
+ value: 56.542
291
+ - type: ndcg_at_1
292
+ value: 48.472
293
+ - type: ndcg_at_10
294
+ value: 61.651999999999994
295
+ - type: ndcg_at_100
296
+ value: 65.257
297
+ - type: ndcg_at_1000
298
+ value: 65.977
299
+ - type: ndcg_at_3
300
+ value: 57.401
301
+ - type: ndcg_at_5
302
+ value: 59.681
303
+ - type: precision_at_1
304
+ value: 48.472
305
+ - type: precision_at_10
306
+ value: 7.576
307
+ - type: precision_at_100
308
+ value: 0.932
309
+ - type: precision_at_1000
310
+ value: 0.099
311
+ - type: precision_at_3
312
+ value: 21.25
313
+ - type: precision_at_5
314
+ value: 13.888
315
+ - type: recall_at_1
316
+ value: 48.419000000000004
317
+ - type: recall_at_10
318
+ value: 74.97399999999999
319
+ - type: recall_at_100
320
+ value: 92.202
321
+ - type: recall_at_1000
322
+ value: 97.893
323
+ - type: recall_at_3
324
+ value: 63.541000000000004
325
+ - type: recall_at_5
326
+ value: 68.994
327
+ - task:
328
+ type: Retrieval
329
+ dataset:
330
+ type: C-MTEB/DuRetrieval
331
+ name: MTEB DuRetrieval
332
+ config: default
333
+ split: dev
334
+ revision: None
335
+ metrics:
336
+ - type: map_at_1
337
+ value: 22.328
338
+ - type: map_at_10
339
+ value: 69.11
340
+ - type: map_at_100
341
+ value: 72.47
342
+ - type: map_at_1000
343
+ value: 72.54599999999999
344
+ - type: map_at_3
345
+ value: 46.938
346
+ - type: map_at_5
347
+ value: 59.56
348
+ - type: mrr_at_1
349
+ value: 81.35
350
+ - type: mrr_at_10
351
+ value: 87.066
352
+ - type: mrr_at_100
353
+ value: 87.212
354
+ - type: mrr_at_1000
355
+ value: 87.21799999999999
356
+ - type: mrr_at_3
357
+ value: 86.558
358
+ - type: mrr_at_5
359
+ value: 86.931
360
+ - type: ndcg_at_1
361
+ value: 81.35
362
+ - type: ndcg_at_10
363
+ value: 78.568
364
+ - type: ndcg_at_100
365
+ value: 82.86099999999999
366
+ - type: ndcg_at_1000
367
+ value: 83.628
368
+ - type: ndcg_at_3
369
+ value: 76.716
370
+ - type: ndcg_at_5
371
+ value: 75.664
372
+ - type: precision_at_1
373
+ value: 81.35
374
+ - type: precision_at_10
375
+ value: 38.545
376
+ - type: precision_at_100
377
+ value: 4.657
378
+ - type: precision_at_1000
379
+ value: 0.484
380
+ - type: precision_at_3
381
+ value: 69.18299999999999
382
+ - type: precision_at_5
383
+ value: 58.67
384
+ - type: recall_at_1
385
+ value: 22.328
386
+ - type: recall_at_10
387
+ value: 80.658
388
+ - type: recall_at_100
389
+ value: 94.093
390
+ - type: recall_at_1000
391
+ value: 98.137
392
+ - type: recall_at_3
393
+ value: 50.260000000000005
394
+ - type: recall_at_5
395
+ value: 66.045
396
+ - task:
397
+ type: Retrieval
398
+ dataset:
399
+ type: C-MTEB/EcomRetrieval
400
+ name: MTEB EcomRetrieval
401
+ config: default
402
+ split: dev
403
+ revision: None
404
+ metrics:
405
+ - type: map_at_1
406
+ value: 43.1
407
+ - type: map_at_10
408
+ value: 52.872
409
+ - type: map_at_100
410
+ value: 53.556000000000004
411
+ - type: map_at_1000
412
+ value: 53.583000000000006
413
+ - type: map_at_3
414
+ value: 50.14999999999999
415
+ - type: map_at_5
416
+ value: 51.925
417
+ - type: mrr_at_1
418
+ value: 43.1
419
+ - type: mrr_at_10
420
+ value: 52.872
421
+ - type: mrr_at_100
422
+ value: 53.556000000000004
423
+ - type: mrr_at_1000
424
+ value: 53.583000000000006
425
+ - type: mrr_at_3
426
+ value: 50.14999999999999
427
+ - type: mrr_at_5
428
+ value: 51.925
429
+ - type: ndcg_at_1
430
+ value: 43.1
431
+ - type: ndcg_at_10
432
+ value: 57.907
433
+ - type: ndcg_at_100
434
+ value: 61.517999999999994
435
+ - type: ndcg_at_1000
436
+ value: 62.175000000000004
437
+ - type: ndcg_at_3
438
+ value: 52.425
439
+ - type: ndcg_at_5
440
+ value: 55.631
441
+ - type: precision_at_1
442
+ value: 43.1
443
+ - type: precision_at_10
444
+ value: 7.380000000000001
445
+ - type: precision_at_100
446
+ value: 0.9129999999999999
447
+ - type: precision_at_1000
448
+ value: 0.096
449
+ - type: precision_at_3
450
+ value: 19.667
451
+ - type: precision_at_5
452
+ value: 13.36
453
+ - type: recall_at_1
454
+ value: 43.1
455
+ - type: recall_at_10
456
+ value: 73.8
457
+ - type: recall_at_100
458
+ value: 91.3
459
+ - type: recall_at_1000
460
+ value: 96.39999999999999
461
+ - type: recall_at_3
462
+ value: 59.0
463
+ - type: recall_at_5
464
+ value: 66.8
465
+ - task:
466
+ type: Classification
467
+ dataset:
468
+ type: C-MTEB/IFlyTek-classification
469
+ name: MTEB IFlyTek
470
+ config: default
471
+ split: validation
472
+ revision: None
473
+ metrics:
474
+ - type: accuracy
475
+ value: 41.146594844170835
476
+ - type: f1
477
+ value: 28.544218732704845
478
+ - task:
479
+ type: Classification
480
+ dataset:
481
+ type: C-MTEB/JDReview-classification
482
+ name: MTEB JDReview
483
+ config: default
484
+ split: test
485
+ revision: None
486
+ metrics:
487
+ - type: accuracy
488
+ value: 82.83302063789868
489
+ - type: ap
490
+ value: 48.881798834997056
491
+ - type: f1
492
+ value: 77.28655923994657
493
+ - task:
494
+ type: STS
495
+ dataset:
496
+ type: C-MTEB/LCQMC
497
+ name: MTEB LCQMC
498
+ config: default
499
+ split: test
500
+ revision: None
501
+ metrics:
502
+ - type: cos_sim_pearson
503
+ value: 66.05467125345538
504
+ - type: cos_sim_spearman
505
+ value: 72.71921060562211
506
+ - type: euclidean_pearson
507
+ value: 71.28539457113986
508
+ - type: euclidean_spearman
509
+ value: 72.71920173126693
510
+ - type: manhattan_pearson
511
+ value: 71.23750818174456
512
+ - type: manhattan_spearman
513
+ value: 72.61025268693467
514
+ - task:
515
+ type: Reranking
516
+ dataset:
517
+ type: C-MTEB/Mmarco-reranking
518
+ name: MTEB MMarcoReranking
519
+ config: default
520
+ split: dev
521
+ revision: None
522
+ metrics:
523
+ - type: map
524
+ value: 26.127712982639483
525
+ - type: mrr
526
+ value: 24.87420634920635
527
+ - task:
528
+ type: Retrieval
529
+ dataset:
530
+ type: C-MTEB/MMarcoRetrieval
531
+ name: MTEB MMarcoRetrieval
532
+ config: default
533
+ split: dev
534
+ revision: None
535
+ metrics:
536
+ - type: map_at_1
537
+ value: 62.517
538
+ - type: map_at_10
539
+ value: 71.251
540
+ - type: map_at_100
541
+ value: 71.647
542
+ - type: map_at_1000
543
+ value: 71.665
544
+ - type: map_at_3
545
+ value: 69.28
546
+ - type: map_at_5
547
+ value: 70.489
548
+ - type: mrr_at_1
549
+ value: 64.613
550
+ - type: mrr_at_10
551
+ value: 71.89
552
+ - type: mrr_at_100
553
+ value: 72.243
554
+ - type: mrr_at_1000
555
+ value: 72.259
556
+ - type: mrr_at_3
557
+ value: 70.138
558
+ - type: mrr_at_5
559
+ value: 71.232
560
+ - type: ndcg_at_1
561
+ value: 64.613
562
+ - type: ndcg_at_10
563
+ value: 75.005
564
+ - type: ndcg_at_100
565
+ value: 76.805
566
+ - type: ndcg_at_1000
567
+ value: 77.281
568
+ - type: ndcg_at_3
569
+ value: 71.234
570
+ - type: ndcg_at_5
571
+ value: 73.294
572
+ - type: precision_at_1
573
+ value: 64.613
574
+ - type: precision_at_10
575
+ value: 9.142
576
+ - type: precision_at_100
577
+ value: 1.004
578
+ - type: precision_at_1000
579
+ value: 0.104
580
+ - type: precision_at_3
581
+ value: 26.781
582
+ - type: precision_at_5
583
+ value: 17.149
584
+ - type: recall_at_1
585
+ value: 62.517
586
+ - type: recall_at_10
587
+ value: 85.997
588
+ - type: recall_at_100
589
+ value: 94.18299999999999
590
+ - type: recall_at_1000
591
+ value: 97.911
592
+ - type: recall_at_3
593
+ value: 75.993
594
+ - type: recall_at_5
595
+ value: 80.88300000000001
596
+ - task:
597
+ type: Classification
598
+ dataset:
599
+ type: mteb/amazon_massive_intent
600
+ name: MTEB MassiveIntentClassification (zh-CN)
601
+ config: zh-CN
602
+ split: test
603
+ revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
604
+ metrics:
605
+ - type: accuracy
606
+ value: 59.27706792199058
607
+ - type: f1
608
+ value: 56.77545011902468
609
+ - task:
610
+ type: Classification
611
+ dataset:
612
+ type: mteb/amazon_massive_scenario
613
+ name: MTEB MassiveScenarioClassification (zh-CN)
614
+ config: zh-CN
615
+ split: test
616
+ revision: 7d571f92784cd94a019292a1f45445077d0ef634
617
+ metrics:
618
+ - type: accuracy
619
+ value: 66.47948890383321
620
+ - type: f1
621
+ value: 66.4502180376861
622
+ - task:
623
+ type: Retrieval
624
+ dataset:
625
+ type: C-MTEB/MedicalRetrieval
626
+ name: MTEB MedicalRetrieval
627
+ config: default
628
+ split: dev
629
+ revision: None
630
+ metrics:
631
+ - type: map_at_1
632
+ value: 54.2
633
+ - type: map_at_10
634
+ value: 59.858
635
+ - type: map_at_100
636
+ value: 60.46
637
+ - type: map_at_1000
638
+ value: 60.507
639
+ - type: map_at_3
640
+ value: 58.416999999999994
641
+ - type: map_at_5
642
+ value: 59.331999999999994
643
+ - type: mrr_at_1
644
+ value: 54.2
645
+ - type: mrr_at_10
646
+ value: 59.862
647
+ - type: mrr_at_100
648
+ value: 60.463
649
+ - type: mrr_at_1000
650
+ value: 60.51
651
+ - type: mrr_at_3
652
+ value: 58.416999999999994
653
+ - type: mrr_at_5
654
+ value: 59.352000000000004
655
+ - type: ndcg_at_1
656
+ value: 54.2
657
+ - type: ndcg_at_10
658
+ value: 62.643
659
+ - type: ndcg_at_100
660
+ value: 65.731
661
+ - type: ndcg_at_1000
662
+ value: 67.096
663
+ - type: ndcg_at_3
664
+ value: 59.727
665
+ - type: ndcg_at_5
666
+ value: 61.375
667
+ - type: precision_at_1
668
+ value: 54.2
669
+ - type: precision_at_10
670
+ value: 7.140000000000001
671
+ - type: precision_at_100
672
+ value: 0.8619999999999999
673
+ - type: precision_at_1000
674
+ value: 0.097
675
+ - type: precision_at_3
676
+ value: 21.166999999999998
677
+ - type: precision_at_5
678
+ value: 13.5
679
+ - type: recall_at_1
680
+ value: 54.2
681
+ - type: recall_at_10
682
+ value: 71.39999999999999
683
+ - type: recall_at_100
684
+ value: 86.2
685
+ - type: recall_at_1000
686
+ value: 97.2
687
+ - type: recall_at_3
688
+ value: 63.5
689
+ - type: recall_at_5
690
+ value: 67.5
691
+ - task:
692
+ type: Classification
693
+ dataset:
694
+ type: C-MTEB/MultilingualSentiment-classification
695
+ name: MTEB MultilingualSentiment
696
+ config: default
697
+ split: validation
698
+ revision: None
699
+ metrics:
700
+ - type: accuracy
701
+ value: 68.19666666666666
702
+ - type: f1
703
+ value: 67.58581661416034
704
+ - task:
705
+ type: PairClassification
706
+ dataset:
707
+ type: C-MTEB/OCNLI
708
+ name: MTEB Ocnli
709
+ config: default
710
+ split: validation
711
+ revision: None
712
+ metrics:
713
+ - type: cos_sim_accuracy
714
+ value: 60.530590146182995
715
+ - type: cos_sim_ap
716
+ value: 63.53656091243922
717
+ - type: cos_sim_f1
718
+ value: 68.09929603556874
719
+ - type: cos_sim_precision
720
+ value: 52.45433789954338
721
+ - type: cos_sim_recall
722
+ value: 97.04329461457233
723
+ - type: dot_accuracy
724
+ value: 60.530590146182995
725
+ - type: dot_ap
726
+ value: 63.53660452157237
727
+ - type: dot_f1
728
+ value: 68.09929603556874
729
+ - type: dot_precision
730
+ value: 52.45433789954338
731
+ - type: dot_recall
732
+ value: 97.04329461457233
733
+ - type: euclidean_accuracy
734
+ value: 60.530590146182995
735
+ - type: euclidean_ap
736
+ value: 63.53678735855631
737
+ - type: euclidean_f1
738
+ value: 68.09929603556874
739
+ - type: euclidean_precision
740
+ value: 52.45433789954338
741
+ - type: euclidean_recall
742
+ value: 97.04329461457233
743
+ - type: manhattan_accuracy
744
+ value: 60.47644829453167
745
+ - type: manhattan_ap
746
+ value: 63.5622508250315
747
+ - type: manhattan_f1
748
+ value: 68.1650700073692
749
+ - type: manhattan_precision
750
+ value: 52.34861346915677
751
+ - type: manhattan_recall
752
+ value: 97.67687434002113
753
+ - type: max_accuracy
754
+ value: 60.530590146182995
755
+ - type: max_ap
756
+ value: 63.5622508250315
757
+ - type: max_f1
758
+ value: 68.1650700073692
759
+ - task:
760
+ type: Classification
761
+ dataset:
762
+ type: C-MTEB/OnlineShopping-classification
763
+ name: MTEB OnlineShopping
764
+ config: default
765
+ split: test
766
+ revision: None
767
+ metrics:
768
+ - type: accuracy
769
+ value: 89.13
770
+ - type: ap
771
+ value: 87.21879260137172
772
+ - type: f1
773
+ value: 89.12359325300508
774
+ - task:
775
+ type: STS
776
+ dataset:
777
+ type: C-MTEB/PAWSX
778
+ name: MTEB PAWSX
779
+ config: default
780
+ split: test
781
+ revision: None
782
+ metrics:
783
+ - type: cos_sim_pearson
784
+ value: 12.035577637900758
785
+ - type: cos_sim_spearman
786
+ value: 12.76524190663864
787
+ - type: euclidean_pearson
788
+ value: 14.4012689427106
789
+ - type: euclidean_spearman
790
+ value: 12.765328992583608
791
+ - type: manhattan_pearson
792
+ value: 14.458505202938946
793
+ - type: manhattan_spearman
794
+ value: 12.763238700117896
795
+ - task:
796
+ type: STS
797
+ dataset:
798
+ type: C-MTEB/QBQTC
799
+ name: MTEB QBQTC
800
+ config: default
801
+ split: test
802
+ revision: None
803
+ metrics:
804
+ - type: cos_sim_pearson
805
+ value: 34.809415339934006
806
+ - type: cos_sim_spearman
807
+ value: 36.96728615916954
808
+ - type: euclidean_pearson
809
+ value: 35.56113673772396
810
+ - type: euclidean_spearman
811
+ value: 36.96842963389308
812
+ - type: manhattan_pearson
813
+ value: 35.5447066178264
814
+ - type: manhattan_spearman
815
+ value: 36.97514513480951
816
+ - task:
817
+ type: STS
818
+ dataset:
819
+ type: mteb/sts22-crosslingual-sts
820
+ name: MTEB STS22 (zh)
821
+ config: zh
822
+ split: test
823
+ revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
824
+ metrics:
825
+ - type: cos_sim_pearson
826
+ value: 66.39448692338551
827
+ - type: cos_sim_spearman
828
+ value: 66.72211526923901
829
+ - type: euclidean_pearson
830
+ value: 65.72981824553035
831
+ - type: euclidean_spearman
832
+ value: 66.72211526923901
833
+ - type: manhattan_pearson
834
+ value: 65.52315559414296
835
+ - type: manhattan_spearman
836
+ value: 66.61931702511545
837
+ - task:
838
+ type: STS
839
+ dataset:
840
+ type: C-MTEB/STSB
841
+ name: MTEB STSB
842
+ config: default
843
+ split: test
844
+ revision: None
845
+ metrics:
846
+ - type: cos_sim_pearson
847
+ value: 76.73608064460915
848
+ - type: cos_sim_spearman
849
+ value: 76.51424826130031
850
+ - type: euclidean_pearson
851
+ value: 76.17930213372487
852
+ - type: euclidean_spearman
853
+ value: 76.51342756283478
854
+ - type: manhattan_pearson
855
+ value: 75.87085607319342
856
+ - type: manhattan_spearman
857
+ value: 76.22676341477134
858
+ - task:
859
+ type: Reranking
860
+ dataset:
861
+ type: C-MTEB/T2Reranking
862
+ name: MTEB T2Reranking
863
+ config: default
864
+ split: dev
865
+ revision: None
866
+ metrics:
867
+ - type: map
868
+ value: 65.38779931543048
869
+ - type: mrr
870
+ value: 74.79313763420059
871
+ - task:
872
+ type: Retrieval
873
+ dataset:
874
+ type: C-MTEB/T2Retrieval
875
+ name: MTEB T2Retrieval
876
+ config: default
877
+ split: dev
878
+ revision: None
879
+ metrics:
880
+ - type: map_at_1
881
+ value: 25.131999999999998
882
+ - type: map_at_10
883
+ value: 69.131
884
+ - type: map_at_100
885
+ value: 72.943
886
+ - type: map_at_1000
887
+ value: 73.045
888
+ - type: map_at_3
889
+ value: 48.847
890
+ - type: map_at_5
891
+ value: 59.842
892
+ - type: mrr_at_1
893
+ value: 85.516
894
+ - type: mrr_at_10
895
+ value: 88.863
896
+ - type: mrr_at_100
897
+ value: 88.996
898
+ - type: mrr_at_1000
899
+ value: 89.00099999999999
900
+ - type: mrr_at_3
901
+ value: 88.277
902
+ - type: mrr_at_5
903
+ value: 88.64800000000001
904
+ - type: ndcg_at_1
905
+ value: 85.516
906
+ - type: ndcg_at_10
907
+ value: 78.122
908
+ - type: ndcg_at_100
909
+ value: 82.673
910
+ - type: ndcg_at_1000
911
+ value: 83.707
912
+ - type: ndcg_at_3
913
+ value: 80.274
914
+ - type: ndcg_at_5
915
+ value: 78.405
916
+ - type: precision_at_1
917
+ value: 85.516
918
+ - type: precision_at_10
919
+ value: 38.975
920
+ - type: precision_at_100
921
+ value: 4.833
922
+ - type: precision_at_1000
923
+ value: 0.509
924
+ - type: precision_at_3
925
+ value: 70.35
926
+ - type: precision_at_5
927
+ value: 58.638
928
+ - type: recall_at_1
929
+ value: 25.131999999999998
930
+ - type: recall_at_10
931
+ value: 76.848
932
+ - type: recall_at_100
933
+ value: 91.489
934
+ - type: recall_at_1000
935
+ value: 96.709
936
+ - type: recall_at_3
937
+ value: 50.824000000000005
938
+ - type: recall_at_5
939
+ value: 63.89
940
+ - task:
941
+ type: Classification
942
+ dataset:
943
+ type: C-MTEB/TNews-classification
944
+ name: MTEB TNews
945
+ config: default
946
+ split: validation
947
+ revision: None
948
+ metrics:
949
+ - type: accuracy
950
+ value: 49.65
951
+ - type: f1
952
+ value: 47.66791473245483
953
+ - task:
954
+ type: Clustering
955
+ dataset:
956
+ type: C-MTEB/ThuNewsClusteringP2P
957
+ name: MTEB ThuNewsClusteringP2P
958
+ config: default
959
+ split: test
960
+ revision: None
961
+ metrics:
962
+ - type: v_measure
963
+ value: 63.78843565968542
964
+ - task:
965
+ type: Clustering
966
+ dataset:
967
+ type: C-MTEB/ThuNewsClusteringS2S
968
+ name: MTEB ThuNewsClusteringS2S
969
+ config: default
970
+ split: test
971
+ revision: None
972
+ metrics:
973
+ - type: v_measure
974
+ value: 55.14095244943176
975
+ - task:
976
+ type: Retrieval
977
+ dataset:
978
+ type: C-MTEB/VideoRetrieval
979
+ name: MTEB VideoRetrieval
980
+ config: default
981
+ split: dev
982
+ revision: None
983
+ metrics:
984
+ - type: map_at_1
985
+ value: 53.800000000000004
986
+ - type: map_at_10
987
+ value: 63.312000000000005
988
+ - type: map_at_100
989
+ value: 63.93600000000001
990
+ - type: map_at_1000
991
+ value: 63.955
992
+ - type: map_at_3
993
+ value: 61.283
994
+ - type: map_at_5
995
+ value: 62.553000000000004
996
+ - type: mrr_at_1
997
+ value: 53.800000000000004
998
+ - type: mrr_at_10
999
+ value: 63.312000000000005
1000
+ - type: mrr_at_100
1001
+ value: 63.93600000000001
1002
+ - type: mrr_at_1000
1003
+ value: 63.955
1004
+ - type: mrr_at_3
1005
+ value: 61.283
1006
+ - type: mrr_at_5
1007
+ value: 62.553000000000004
1008
+ - type: ndcg_at_1
1009
+ value: 53.800000000000004
1010
+ - type: ndcg_at_10
1011
+ value: 67.693
1012
+ - type: ndcg_at_100
1013
+ value: 70.552
1014
+ - type: ndcg_at_1000
1015
+ value: 71.06099999999999
1016
+ - type: ndcg_at_3
1017
+ value: 63.632
1018
+ - type: ndcg_at_5
1019
+ value: 65.90899999999999
1020
+ - type: precision_at_1
1021
+ value: 53.800000000000004
1022
+ - type: precision_at_10
1023
+ value: 8.129999999999999
1024
+ - type: precision_at_100
1025
+ value: 0.943
1026
+ - type: precision_at_1000
1027
+ value: 0.098
1028
+ - type: precision_at_3
1029
+ value: 23.467
1030
+ - type: precision_at_5
1031
+ value: 15.18
1032
+ - type: recall_at_1
1033
+ value: 53.800000000000004
1034
+ - type: recall_at_10
1035
+ value: 81.3
1036
+ - type: recall_at_100
1037
+ value: 94.3
1038
+ - type: recall_at_1000
1039
+ value: 98.3
1040
+ - type: recall_at_3
1041
+ value: 70.39999999999999
1042
+ - type: recall_at_5
1043
+ value: 75.9
1044
+ - task:
1045
+ type: Classification
1046
+ dataset:
1047
+ type: C-MTEB/waimai-classification
1048
+ name: MTEB Waimai
1049
+ config: default
1050
+ split: test
1051
+ revision: None
1052
+ metrics:
1053
+ - type: accuracy
1054
+ value: 84.96000000000001
1055
+ - type: ap
1056
+ value: 66.89917287702019
1057
+ - type: f1
1058
+ value: 83.0239988458119
1059
+ language:
1060
+ - zh
1061
+ license: mit
1062
+ ---
1063
+
1064
+
1065
+ ---
1066
+
1067
+
1068
+ *Converted and quantized ONNX version of [thenlper/gte-small-zh](https://huggingface.co/thenlper/gte-small-zh) for use with Transformers.js.*
1069
+
1070
+ ---
1071
+
1072
+ # gte-small-zh
1073
+
1074
+ General Text Embeddings (GTE) model. [Towards General Text Embeddings with Multi-stage Contrastive Learning](https://arxiv.org/abs/2308.03281)
1075
+
1076
+ The GTE models are trained by Alibaba DAMO Academy. They are mainly based on the BERT framework and are currently offered in different sizes for both Chinese and English. The GTE models are trained on a large-scale corpus of relevant text pairs, covering a wide range of domains and scenarios. This enables the GTE models to be applied to various downstream tasks of text embeddings, including **information retrieval**, **semantic textual similarity**, **text reranking**, etc.
1077
+
1078
+ ## Model List
1079
+
1080
+ | Models | Language | Max Sequence Length | Dimension | Model Size |
1081
+ |:-----: | :-----: |:-----: |:-----: |:-----: |
1082
+ |[GTE-large-zh](https://huggingface.co/thenlper/gte-large-zh) | Chinese | 512 | 1024 | 0.67GB |
1083
+ |[GTE-base-zh](https://huggingface.co/thenlper/gte-base-zh) | Chinese | 512 | 768 | 0.21GB |
1084
+ |[GTE-small-zh](https://huggingface.co/thenlper/gte-small-zh) | Chinese | 512 | 512 | 0.10GB |
1085
+ |[GTE-large](https://huggingface.co/thenlper/gte-large) | English | 512 | 1024 | 0.67GB |
1086
+ |[GTE-base](https://huggingface.co/thenlper/gte-base) | English | 512 | 768 | 0.21GB |
1087
+ |[GTE-small](https://huggingface.co/thenlper/gte-small) | English | 512 | 384 | 0.10GB |
1088
+
1089
+ ## Metrics
1090
+
1091
+ We compared the performance of the GTE models with other popular text embedding models on the MTEB (CMTEB for Chinese language) benchmark. For more detailed comparison results, please refer to the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
1092
+
1093
+ - Evaluation results on CMTEB
1094
+
1095
+ | Model | Model Size (GB) | Embedding Dimensions | Sequence Length | Average (35 datasets) | Classification (9 datasets) | Clustering (4 datasets) | Pair Classification (2 datasets) | Reranking (4 datasets) | Retrieval (8 datasets) | STS (8 datasets) |
1096
+ | ------------------- | -------------- | -------------------- | ---------------- | --------------------- | ------------------------------------ | ------------------------------ | --------------------------------------- | ------------------------------ | ---------------------------- | ------------------------ |
1097
+ | **gte-large-zh** | 0.65 | 1024 | 512 | **66.72** | 71.34 | 53.07 | 81.14 | 67.42 | 72.49 | 57.82 |
1098
+ | gte-base-zh | 0.20 | 768 | 512 | 65.92 | 71.26 | 53.86 | 80.44 | 67.00 | 71.71 | 55.96 |
1099
+ | stella-large-zh-v2 | 0.65 | 1024 | 1024 | 65.13 | 69.05 | 49.16 | 82.68 | 66.41 | 70.14 | 58.66 |
1100
+ | stella-large-zh | 0.65 | 1024 | 1024 | 64.54 | 67.62 | 48.65 | 78.72 | 65.98 | 71.02 | 58.3 |
1101
+ | bge-large-zh-v1.5 | 1.3 | 1024 | 512 | 64.53 | 69.13 | 48.99 | 81.6 | 65.84 | 70.46 | 56.25 |
1102
+ | stella-base-zh-v2 | 0.21 | 768 | 1024 | 64.36 | 68.29 | 49.4 | 79.96 | 66.1 | 70.08 | 56.92 |
1103
+ | stella-base-zh | 0.21 | 768 | 1024 | 64.16 | 67.77 | 48.7 | 76.09 | 66.95 | 71.07 | 56.54 |
1104
+ | piccolo-large-zh | 0.65 | 1024 | 512 | 64.11 | 67.03 | 47.04 | 78.38 | 65.98 | 70.93 | 58.02 |
1105
+ | piccolo-base-zh | 0.2 | 768 | 512 | 63.66 | 66.98 | 47.12 | 76.61 | 66.68 | 71.2 | 55.9 |
1106
+ | gte-small-zh | 0.1 | 512 | 512 | 60.04 | 64.35 | 48.95 | 69.99 | 66.21 | 65.50 | 49.72 |
1107
+ | bge-small-zh-v1.5 | 0.1 | 512 | 512 | 57.82 | 63.96 | 44.18 | 70.4 | 60.92 | 61.77 | 49.1 |
1108
+ | m3e-base | 0.41 | 768 | 512 | 57.79 | 67.52 | 47.68 | 63.99 | 59.54| 56.91 | 50.47 |
1109
+ |text-embedding-ada-002(openai) | - | 1536| 8192 | 53.02 | 64.31 | 45.68 | 69.56 | 54.28 | 52.0 | 43.35 |
1110
+
1111
+
1112
+ ## Usage
1113
+
1114
+ Code example
1115
+
1116
+ ```python
1117
+ import torch.nn.functional as F
1118
+ from torch import Tensor
1119
+ from transformers import AutoTokenizer, AutoModel
1120
+
1121
+ input_texts = [
1122
+ "中国的首都是哪里",
1123
+ "你喜欢去哪里旅游",
1124
+ "北京",
1125
+ "今天中午吃什么"
1126
+ ]
1127
+
1128
+ tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small-zh")
1129
+ model = AutoModel.from_pretrained("thenlper/gte-small-zh")
1130
+
1131
+ # Tokenize the input texts
1132
+ batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
1133
+
1134
+ outputs = model(**batch_dict)
1135
+ embeddings = outputs.last_hidden_state[:, 0]
1136
+
1137
+ # (Optionally) normalize embeddings
1138
+ embeddings = F.normalize(embeddings, p=2, dim=1)
1139
+ scores = (embeddings[:1] @ embeddings[1:].T) * 100
1140
+ print(scores.tolist())
1141
+ ```
1142
+
1143
+ Use with sentence-transformers:
1144
+
1145
+ ```python
1146
+ from sentence_transformers import SentenceTransformer
1147
+ from sentence_transformers.util import cos_sim
1148
+
1149
+ sentences = ['That is a happy person', 'That is a very happy person']
1150
+
1151
+ model = SentenceTransformer('thenlper/gte-small-zh')
1152
+ embeddings = model.encode(sentences)
1153
+ print(cos_sim(embeddings[0], embeddings[1]))
1154
+ ```
1155
+
1156
+ ### Limitation
1157
+
1158
+ This model only supports Chinese text, and inputs longer than 512 tokens are truncated.
1159
+
1160
+ ### Citation
1161
+
1162
+ If you find our paper or models helpful, please consider citing them as follows:
1163
+
1164
+ ```
1165
+ @article{li2023towards,
1166
+ title={Towards general text embeddings with multi-stage contrastive learning},
1167
+ author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
1168
+ journal={arXiv preprint arXiv:2308.03281},
1169
+ year={2023}
1170
+ }
1171
  ```