Xenova (HF staff) committed
Commit 99200a9
1 Parent(s): d46e7a3

Upload folder using huggingface_hub

1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "word_embedding_dimension": 768,
+   "pooling_mode_cls_token": false,
+   "pooling_mode_mean_tokens": true,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false
+ }
README.md ADDED
@@ -0,0 +1,1268 @@
+ ---
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+ - mteb
+ inference: false
+ license: apache-2.0
+ language:
+ - en
+ - zh
+ model-index:
+ - name: jina-embeddings-v2-base-zh
+   results:
+   - task:
+       type: STS
+     dataset:
+       type: C-MTEB/AFQMC
+       name: MTEB AFQMC
+       config: default
+       split: validation
+       revision: None
+     metrics:
+     - type: cos_sim_pearson
+       value: 48.51403119231363
+     - type: cos_sim_spearman
+       value: 50.5928547846445
+     - type: euclidean_pearson
+       value: 48.750436310559074
+     - type: euclidean_spearman
+       value: 50.50950238691385
+     - type: manhattan_pearson
+       value: 48.7866189440328
+     - type: manhattan_spearman
+       value: 50.58692402017165
+   - task:
+       type: STS
+     dataset:
+       type: C-MTEB/ATEC
+       name: MTEB ATEC
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: cos_sim_pearson
+       value: 50.25985700105725
+     - type: cos_sim_spearman
+       value: 51.28815934593989
+     - type: euclidean_pearson
+       value: 52.70329248799904
+     - type: euclidean_spearman
+       value: 50.94101139559258
+     - type: manhattan_pearson
+       value: 52.6647237400892
+     - type: manhattan_spearman
+       value: 50.922441325406176
+   - task:
+       type: Classification
+     dataset:
+       type: mteb/amazon_reviews_multi
+       name: MTEB AmazonReviewsClassification (zh)
+       config: zh
+       split: test
+       revision: 1399c76144fd37290681b995c656ef9b2e06e26d
+     metrics:
+     - type: accuracy
+       value: 34.944
+     - type: f1
+       value: 34.06478860660109
+   - task:
+       type: STS
+     dataset:
+       type: C-MTEB/BQ
+       name: MTEB BQ
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: cos_sim_pearson
+       value: 65.15667035488342
+     - type: cos_sim_spearman
+       value: 66.07110142081
+     - type: euclidean_pearson
+       value: 60.447598102249714
+     - type: euclidean_spearman
+       value: 61.826575796578766
+     - type: manhattan_pearson
+       value: 60.39364279354984
+     - type: manhattan_spearman
+       value: 61.78743491223281
+   - task:
+       type: Clustering
+     dataset:
+       type: C-MTEB/CLSClusteringP2P
+       name: MTEB CLSClusteringP2P
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: v_measure
+       value: 39.96714175391701
+   - task:
+       type: Clustering
+     dataset:
+       type: C-MTEB/CLSClusteringS2S
+       name: MTEB CLSClusteringS2S
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: v_measure
+       value: 38.39863566717934
+   - task:
+       type: Reranking
+     dataset:
+       type: C-MTEB/CMedQAv1-reranking
+       name: MTEB CMedQAv1
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: map
+       value: 83.63680381780644
+     - type: mrr
+       value: 86.16476190476192
+   - task:
+       type: Reranking
+     dataset:
+       type: C-MTEB/CMedQAv2-reranking
+       name: MTEB CMedQAv2
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: map
+       value: 83.74350667859487
+     - type: mrr
+       value: 86.10388888888889
+   - task:
+       type: Retrieval
+     dataset:
+       type: C-MTEB/CmedqaRetrieval
+       name: MTEB CmedqaRetrieval
+       config: default
+       split: dev
+       revision: None
+     metrics:
+     - type: map_at_1
+       value: 22.072
+     - type: map_at_10
+       value: 32.942
+     - type: map_at_100
+       value: 34.768
+     - type: map_at_1000
+       value: 34.902
+     - type: map_at_3
+       value: 29.357
+     - type: map_at_5
+       value: 31.236000000000004
+     - type: mrr_at_1
+       value: 34.259
+     - type: mrr_at_10
+       value: 41.957
+     - type: mrr_at_100
+       value: 42.982
+     - type: mrr_at_1000
+       value: 43.042
+     - type: mrr_at_3
+       value: 39.722
+     - type: mrr_at_5
+       value: 40.898
+     - type: ndcg_at_1
+       value: 34.259
+     - type: ndcg_at_10
+       value: 39.153
+     - type: ndcg_at_100
+       value: 46.493
+     - type: ndcg_at_1000
+       value: 49.01
+     - type: ndcg_at_3
+       value: 34.636
+     - type: ndcg_at_5
+       value: 36.278
+     - type: precision_at_1
+       value: 34.259
+     - type: precision_at_10
+       value: 8.815000000000001
+     - type: precision_at_100
+       value: 1.474
+     - type: precision_at_1000
+       value: 0.179
+     - type: precision_at_3
+       value: 19.73
+     - type: precision_at_5
+       value: 14.174000000000001
+     - type: recall_at_1
+       value: 22.072
+     - type: recall_at_10
+       value: 48.484
+     - type: recall_at_100
+       value: 79.035
+     - type: recall_at_1000
+       value: 96.15
+     - type: recall_at_3
+       value: 34.607
+     - type: recall_at_5
+       value: 40.064
+   - task:
+       type: PairClassification
+     dataset:
+       type: C-MTEB/CMNLI
+       name: MTEB Cmnli
+       config: default
+       split: validation
+       revision: None
+     metrics:
+     - type: cos_sim_accuracy
+       value: 76.7047504509922
+     - type: cos_sim_ap
+       value: 85.26649874800871
+     - type: cos_sim_f1
+       value: 78.13528724646915
+     - type: cos_sim_precision
+       value: 71.57587548638132
+     - type: cos_sim_recall
+       value: 86.01823708206688
+     - type: dot_accuracy
+       value: 70.13830426939266
+     - type: dot_ap
+       value: 77.01510412382171
+     - type: dot_f1
+       value: 73.56710042713817
+     - type: dot_precision
+       value: 63.955094991364426
+     - type: dot_recall
+       value: 86.57937806873977
+     - type: euclidean_accuracy
+       value: 75.53818400481059
+     - type: euclidean_ap
+       value: 84.34668448241264
+     - type: euclidean_f1
+       value: 77.51741608613047
+     - type: euclidean_precision
+       value: 70.65614777756399
+     - type: euclidean_recall
+       value: 85.85457096095394
+     - type: manhattan_accuracy
+       value: 75.49007817197835
+     - type: manhattan_ap
+       value: 84.40297506704299
+     - type: manhattan_f1
+       value: 77.63185324160932
+     - type: manhattan_precision
+       value: 70.03949595636637
+     - type: manhattan_recall
+       value: 87.07037643207856
+     - type: max_accuracy
+       value: 76.7047504509922
+     - type: max_ap
+       value: 85.26649874800871
+     - type: max_f1
+       value: 78.13528724646915
+   - task:
+       type: Retrieval
+     dataset:
+       type: C-MTEB/CovidRetrieval
+       name: MTEB CovidRetrieval
+       config: default
+       split: dev
+       revision: None
+     metrics:
+     - type: map_at_1
+       value: 69.178
+     - type: map_at_10
+       value: 77.523
+     - type: map_at_100
+       value: 77.793
+     - type: map_at_1000
+       value: 77.79899999999999
+     - type: map_at_3
+       value: 75.878
+     - type: map_at_5
+       value: 76.849
+     - type: mrr_at_1
+       value: 69.44200000000001
+     - type: mrr_at_10
+       value: 77.55
+     - type: mrr_at_100
+       value: 77.819
+     - type: mrr_at_1000
+       value: 77.826
+     - type: mrr_at_3
+       value: 75.957
+     - type: mrr_at_5
+       value: 76.916
+     - type: ndcg_at_1
+       value: 69.44200000000001
+     - type: ndcg_at_10
+       value: 81.217
+     - type: ndcg_at_100
+       value: 82.45
+     - type: ndcg_at_1000
+       value: 82.636
+     - type: ndcg_at_3
+       value: 77.931
+     - type: ndcg_at_5
+       value: 79.655
+     - type: precision_at_1
+       value: 69.44200000000001
+     - type: precision_at_10
+       value: 9.357
+     - type: precision_at_100
+       value: 0.993
+     - type: precision_at_1000
+       value: 0.101
+     - type: precision_at_3
+       value: 28.1
+     - type: precision_at_5
+       value: 17.724
+     - type: recall_at_1
+       value: 69.178
+     - type: recall_at_10
+       value: 92.624
+     - type: recall_at_100
+       value: 98.209
+     - type: recall_at_1000
+       value: 99.684
+     - type: recall_at_3
+       value: 83.772
+     - type: recall_at_5
+       value: 87.882
+   - task:
+       type: Retrieval
+     dataset:
+       type: C-MTEB/DuRetrieval
+       name: MTEB DuRetrieval
+       config: default
+       split: dev
+       revision: None
+     metrics:
+     - type: map_at_1
+       value: 25.163999999999998
+     - type: map_at_10
+       value: 76.386
+     - type: map_at_100
+       value: 79.339
+     - type: map_at_1000
+       value: 79.39500000000001
+     - type: map_at_3
+       value: 52.959
+     - type: map_at_5
+       value: 66.59
+     - type: mrr_at_1
+       value: 87.9
+     - type: mrr_at_10
+       value: 91.682
+     - type: mrr_at_100
+       value: 91.747
+     - type: mrr_at_1000
+       value: 91.751
+     - type: mrr_at_3
+       value: 91.267
+     - type: mrr_at_5
+       value: 91.527
+     - type: ndcg_at_1
+       value: 87.9
+     - type: ndcg_at_10
+       value: 84.569
+     - type: ndcg_at_100
+       value: 87.83800000000001
+     - type: ndcg_at_1000
+       value: 88.322
+     - type: ndcg_at_3
+       value: 83.473
+     - type: ndcg_at_5
+       value: 82.178
+     - type: precision_at_1
+       value: 87.9
+     - type: precision_at_10
+       value: 40.605000000000004
+     - type: precision_at_100
+       value: 4.752
+     - type: precision_at_1000
+       value: 0.488
+     - type: precision_at_3
+       value: 74.9
+     - type: precision_at_5
+       value: 62.96000000000001
+     - type: recall_at_1
+       value: 25.163999999999998
+     - type: recall_at_10
+       value: 85.97399999999999
+     - type: recall_at_100
+       value: 96.63000000000001
+     - type: recall_at_1000
+       value: 99.016
+     - type: recall_at_3
+       value: 55.611999999999995
+     - type: recall_at_5
+       value: 71.936
+   - task:
+       type: Retrieval
+     dataset:
+       type: C-MTEB/EcomRetrieval
+       name: MTEB EcomRetrieval
+       config: default
+       split: dev
+       revision: None
+     metrics:
+     - type: map_at_1
+       value: 48.6
+     - type: map_at_10
+       value: 58.831
+     - type: map_at_100
+       value: 59.427
+     - type: map_at_1000
+       value: 59.44199999999999
+     - type: map_at_3
+       value: 56.383
+     - type: map_at_5
+       value: 57.753
+     - type: mrr_at_1
+       value: 48.6
+     - type: mrr_at_10
+       value: 58.831
+     - type: mrr_at_100
+       value: 59.427
+     - type: mrr_at_1000
+       value: 59.44199999999999
+     - type: mrr_at_3
+       value: 56.383
+     - type: mrr_at_5
+       value: 57.753
+     - type: ndcg_at_1
+       value: 48.6
+     - type: ndcg_at_10
+       value: 63.951
+     - type: ndcg_at_100
+       value: 66.72200000000001
+     - type: ndcg_at_1000
+       value: 67.13900000000001
+     - type: ndcg_at_3
+       value: 58.882
+     - type: ndcg_at_5
+       value: 61.373
+     - type: precision_at_1
+       value: 48.6
+     - type: precision_at_10
+       value: 8.01
+     - type: precision_at_100
+       value: 0.928
+     - type: precision_at_1000
+       value: 0.096
+     - type: precision_at_3
+       value: 22.033
+     - type: precision_at_5
+       value: 14.44
+     - type: recall_at_1
+       value: 48.6
+     - type: recall_at_10
+       value: 80.10000000000001
+     - type: recall_at_100
+       value: 92.80000000000001
+     - type: recall_at_1000
+       value: 96.1
+     - type: recall_at_3
+       value: 66.10000000000001
+     - type: recall_at_5
+       value: 72.2
+   - task:
+       type: Classification
+     dataset:
+       type: C-MTEB/IFlyTek-classification
+       name: MTEB IFlyTek
+       config: default
+       split: validation
+       revision: None
+     metrics:
+     - type: accuracy
+       value: 47.36437091188918
+     - type: f1
+       value: 36.60946954228577
+   - task:
+       type: Classification
+     dataset:
+       type: C-MTEB/JDReview-classification
+       name: MTEB JDReview
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: accuracy
+       value: 79.5684803001876
+     - type: ap
+       value: 42.671935929201524
+     - type: f1
+       value: 73.31912729103752
+   - task:
+       type: STS
+     dataset:
+       type: C-MTEB/LCQMC
+       name: MTEB LCQMC
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: cos_sim_pearson
+       value: 68.62670112113864
+     - type: cos_sim_spearman
+       value: 75.74009123170768
+     - type: euclidean_pearson
+       value: 73.93002595958237
+     - type: euclidean_spearman
+       value: 75.35222935003587
+     - type: manhattan_pearson
+       value: 73.89870445158144
+     - type: manhattan_spearman
+       value: 75.31714936339398
+   - task:
+       type: Reranking
+     dataset:
+       type: C-MTEB/Mmarco-reranking
+       name: MTEB MMarcoReranking
+       config: default
+       split: dev
+       revision: None
+     metrics:
+     - type: map
+       value: 31.5372713650176
+     - type: mrr
+       value: 30.163095238095238
+   - task:
+       type: Retrieval
+     dataset:
+       type: C-MTEB/MMarcoRetrieval
+       name: MTEB MMarcoRetrieval
+       config: default
+       split: dev
+       revision: None
+     metrics:
+     - type: map_at_1
+       value: 65.054
+     - type: map_at_10
+       value: 74.156
+     - type: map_at_100
+       value: 74.523
+     - type: map_at_1000
+       value: 74.535
+     - type: map_at_3
+       value: 72.269
+     - type: map_at_5
+       value: 73.41
+     - type: mrr_at_1
+       value: 67.24900000000001
+     - type: mrr_at_10
+       value: 74.78399999999999
+     - type: mrr_at_100
+       value: 75.107
+     - type: mrr_at_1000
+       value: 75.117
+     - type: mrr_at_3
+       value: 73.13499999999999
+     - type: mrr_at_5
+       value: 74.13499999999999
+     - type: ndcg_at_1
+       value: 67.24900000000001
+     - type: ndcg_at_10
+       value: 77.96300000000001
+     - type: ndcg_at_100
+       value: 79.584
+     - type: ndcg_at_1000
+       value: 79.884
+     - type: ndcg_at_3
+       value: 74.342
+     - type: ndcg_at_5
+       value: 76.278
+     - type: precision_at_1
+       value: 67.24900000000001
+     - type: precision_at_10
+       value: 9.466
+     - type: precision_at_100
+       value: 1.027
+     - type: precision_at_1000
+       value: 0.105
+     - type: precision_at_3
+       value: 27.955999999999996
+     - type: precision_at_5
+       value: 17.817
+     - type: recall_at_1
+       value: 65.054
+     - type: recall_at_10
+       value: 89.113
+     - type: recall_at_100
+       value: 96.369
+     - type: recall_at_1000
+       value: 98.714
+     - type: recall_at_3
+       value: 79.45400000000001
+     - type: recall_at_5
+       value: 84.06
+   - task:
+       type: Classification
+     dataset:
+       type: mteb/amazon_massive_intent
+       name: MTEB MassiveIntentClassification (zh-CN)
+       config: zh-CN
+       split: test
+       revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
+     metrics:
+     - type: accuracy
+       value: 68.1977135171486
+     - type: f1
+       value: 67.23114308718404
+   - task:
+       type: Classification
+     dataset:
+       type: mteb/amazon_massive_scenario
+       name: MTEB MassiveScenarioClassification (zh-CN)
+       config: zh-CN
+       split: test
+       revision: 7d571f92784cd94a019292a1f45445077d0ef634
+     metrics:
+     - type: accuracy
+       value: 71.92669804976462
+     - type: f1
+       value: 72.90628475628779
+   - task:
+       type: Retrieval
+     dataset:
+       type: C-MTEB/MedicalRetrieval
+       name: MTEB MedicalRetrieval
+       config: default
+       split: dev
+       revision: None
+     metrics:
+     - type: map_at_1
+       value: 49.2
+     - type: map_at_10
+       value: 54.539
+     - type: map_at_100
+       value: 55.135
+     - type: map_at_1000
+       value: 55.19199999999999
+     - type: map_at_3
+       value: 53.383
+     - type: map_at_5
+       value: 54.142999999999994
+     - type: mrr_at_1
+       value: 49.2
+     - type: mrr_at_10
+       value: 54.539
+     - type: mrr_at_100
+       value: 55.135999999999996
+     - type: mrr_at_1000
+       value: 55.19199999999999
+     - type: mrr_at_3
+       value: 53.383
+     - type: mrr_at_5
+       value: 54.142999999999994
+     - type: ndcg_at_1
+       value: 49.2
+     - type: ndcg_at_10
+       value: 57.123000000000005
+     - type: ndcg_at_100
+       value: 60.21300000000001
+     - type: ndcg_at_1000
+       value: 61.915
+     - type: ndcg_at_3
+       value: 54.772
+     - type: ndcg_at_5
+       value: 56.157999999999994
+     - type: precision_at_1
+       value: 49.2
+     - type: precision_at_10
+       value: 6.52
+     - type: precision_at_100
+       value: 0.8009999999999999
+     - type: precision_at_1000
+       value: 0.094
+     - type: precision_at_3
+       value: 19.6
+     - type: precision_at_5
+       value: 12.44
+     - type: recall_at_1
+       value: 49.2
+     - type: recall_at_10
+       value: 65.2
+     - type: recall_at_100
+       value: 80.10000000000001
+     - type: recall_at_1000
+       value: 93.89999999999999
+     - type: recall_at_3
+       value: 58.8
+     - type: recall_at_5
+       value: 62.2
+   - task:
+       type: Classification
+     dataset:
+       type: C-MTEB/MultilingualSentiment-classification
+       name: MTEB MultilingualSentiment
+       config: default
+       split: validation
+       revision: None
+     metrics:
+     - type: accuracy
+       value: 63.29333333333334
+     - type: f1
+       value: 63.03293854259612
+   - task:
+       type: PairClassification
+     dataset:
+       type: C-MTEB/OCNLI
+       name: MTEB Ocnli
+       config: default
+       split: validation
+       revision: None
+     metrics:
+     - type: cos_sim_accuracy
+       value: 75.69030860855442
+     - type: cos_sim_ap
+       value: 80.6157833772759
+     - type: cos_sim_f1
+       value: 77.87524366471735
+     - type: cos_sim_precision
+       value: 72.3076923076923
+     - type: cos_sim_recall
+       value: 84.37170010559663
+     - type: dot_accuracy
+       value: 67.78559826746074
+     - type: dot_ap
+       value: 72.00871467527499
+     - type: dot_f1
+       value: 72.58722247394654
+     - type: dot_precision
+       value: 63.57142857142857
+     - type: dot_recall
+       value: 84.58289334741288
+     - type: euclidean_accuracy
+       value: 75.20303194369248
+     - type: euclidean_ap
+       value: 80.98587256415605
+     - type: euclidean_f1
+       value: 77.26396917148362
+     - type: euclidean_precision
+       value: 71.03631532329496
+     - type: euclidean_recall
+       value: 84.68848996832101
+     - type: manhattan_accuracy
+       value: 75.20303194369248
+     - type: manhattan_ap
+       value: 80.93460699513219
+     - type: manhattan_f1
+       value: 77.124773960217
+     - type: manhattan_precision
+       value: 67.43083003952569
+     - type: manhattan_recall
+       value: 90.07391763463569
+     - type: max_accuracy
+       value: 75.69030860855442
+     - type: max_ap
+       value: 80.98587256415605
+     - type: max_f1
+       value: 77.87524366471735
+   - task:
+       type: Classification
+     dataset:
+       type: C-MTEB/OnlineShopping-classification
+       name: MTEB OnlineShopping
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: accuracy
+       value: 87.00000000000001
+     - type: ap
+       value: 83.24372135949511
+     - type: f1
+       value: 86.95554191530607
+   - task:
+       type: STS
+     dataset:
+       type: C-MTEB/PAWSX
+       name: MTEB PAWSX
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: cos_sim_pearson
+       value: 37.57616811591219
+     - type: cos_sim_spearman
+       value: 41.490259084930045
+     - type: euclidean_pearson
+       value: 38.9155043692188
+     - type: euclidean_spearman
+       value: 39.16056534305623
+     - type: manhattan_pearson
+       value: 38.76569892264335
+     - type: manhattan_spearman
+       value: 38.99891685590743
+   - task:
+       type: STS
+     dataset:
+       type: C-MTEB/QBQTC
+       name: MTEB QBQTC
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: cos_sim_pearson
+       value: 35.44858610359665
+     - type: cos_sim_spearman
+       value: 38.11128146262466
+     - type: euclidean_pearson
+       value: 31.928644189822457
+     - type: euclidean_spearman
+       value: 34.384936631696554
+     - type: manhattan_pearson
+       value: 31.90586687414376
+     - type: manhattan_spearman
+       value: 34.35770153777186
+   - task:
+       type: STS
+     dataset:
+       type: mteb/sts22-crosslingual-sts
+       name: MTEB STS22 (zh)
+       config: zh
+       split: test
+       revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
+     metrics:
+     - type: cos_sim_pearson
+       value: 66.54931957553592
+     - type: cos_sim_spearman
+       value: 69.25068863016632
+     - type: euclidean_pearson
+       value: 50.26525596106869
+     - type: euclidean_spearman
+       value: 63.83352741910006
+     - type: manhattan_pearson
+       value: 49.98798282198196
+     - type: manhattan_spearman
+       value: 63.87649521907841
+   - task:
+       type: STS
+     dataset:
+       type: C-MTEB/STSB
+       name: MTEB STSB
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: cos_sim_pearson
+       value: 82.52782476625825
+     - type: cos_sim_spearman
+       value: 82.55618986168398
+     - type: euclidean_pearson
+       value: 78.48190631687673
+     - type: euclidean_spearman
+       value: 78.39479731354655
+     - type: manhattan_pearson
+       value: 78.51176592165885
+     - type: manhattan_spearman
+       value: 78.42363787303265
+   - task:
+       type: Reranking
+     dataset:
+       type: C-MTEB/T2Reranking
+       name: MTEB T2Reranking
+       config: default
+       split: dev
+       revision: None
+     metrics:
+     - type: map
+       value: 67.36693873615643
+     - type: mrr
+       value: 77.83847701797939
+   - task:
+       type: Retrieval
+     dataset:
+       type: C-MTEB/T2Retrieval
+       name: MTEB T2Retrieval
+       config: default
+       split: dev
+       revision: None
+     metrics:
+     - type: map_at_1
+       value: 25.795
+     - type: map_at_10
+       value: 72.258
+     - type: map_at_100
+       value: 76.049
+     - type: map_at_1000
+       value: 76.134
+     - type: map_at_3
+       value: 50.697
+     - type: map_at_5
+       value: 62.324999999999996
+     - type: mrr_at_1
+       value: 86.634
+     - type: mrr_at_10
+       value: 89.792
+     - type: mrr_at_100
+       value: 89.91900000000001
+     - type: mrr_at_1000
+       value: 89.923
+     - type: mrr_at_3
+       value: 89.224
+     - type: mrr_at_5
+       value: 89.608
+     - type: ndcg_at_1
+       value: 86.634
+     - type: ndcg_at_10
+       value: 80.589
+     - type: ndcg_at_100
+       value: 84.812
+     - type: ndcg_at_1000
+       value: 85.662
+     - type: ndcg_at_3
+       value: 82.169
+     - type: ndcg_at_5
+       value: 80.619
+     - type: precision_at_1
+       value: 86.634
+     - type: precision_at_10
+       value: 40.389
+     - type: precision_at_100
+       value: 4.93
+     - type: precision_at_1000
+       value: 0.513
+     - type: precision_at_3
+       value: 72.104
+     - type: precision_at_5
+       value: 60.425
+     - type: recall_at_1
+       value: 25.795
+     - type: recall_at_10
+       value: 79.565
+     - type: recall_at_100
+       value: 93.24799999999999
+     - type: recall_at_1000
+       value: 97.595
+     - type: recall_at_3
+       value: 52.583999999999996
+     - type: recall_at_5
+       value: 66.175
+   - task:
+       type: Classification
+     dataset:
+       type: C-MTEB/TNews-classification
+       name: MTEB TNews
+       config: default
+       split: validation
+       revision: None
+     metrics:
+     - type: accuracy
+       value: 47.648999999999994
+     - type: f1
+       value: 46.28925837008413
+   - task:
+       type: Clustering
+     dataset:
+       type: C-MTEB/ThuNewsClusteringP2P
+       name: MTEB ThuNewsClusteringP2P
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: v_measure
+       value: 54.07641891287953
+   - task:
+       type: Clustering
+     dataset:
+       type: C-MTEB/ThuNewsClusteringS2S
+       name: MTEB ThuNewsClusteringS2S
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: v_measure
+       value: 53.423702062353954
+   - task:
+       type: Retrieval
+     dataset:
+       type: C-MTEB/VideoRetrieval
+       name: MTEB VideoRetrieval
+       config: default
+       split: dev
+       revision: None
+     metrics:
+     - type: map_at_1
+       value: 55.7
+     - type: map_at_10
+       value: 65.923
+     - type: map_at_100
+       value: 66.42
+     - type: map_at_1000
+       value: 66.431
+     - type: map_at_3
+       value: 63.9
+     - type: map_at_5
+       value: 65.225
+     - type: mrr_at_1
+       value: 55.60000000000001
+     - type: mrr_at_10
+       value: 65.873
+     - type: mrr_at_100
+       value: 66.36999999999999
+     - type: mrr_at_1000
+       value: 66.381
+     - type: mrr_at_3
+       value: 63.849999999999994
+     - type: mrr_at_5
+       value: 65.17500000000001
+     - type: ndcg_at_1
+       value: 55.7
+     - type: ndcg_at_10
+       value: 70.621
+     - type: ndcg_at_100
+       value: 72.944
+     - type: ndcg_at_1000
+       value: 73.25399999999999
+     - type: ndcg_at_3
+       value: 66.547
+     - type: ndcg_at_5
+       value: 68.93599999999999
+     - type: precision_at_1
+       value: 55.7
+     - type: precision_at_10
+       value: 8.52
+     - type: precision_at_100
+       value: 0.958
+     - type: precision_at_1000
+       value: 0.098
+     - type: precision_at_3
+       value: 24.733
+     - type: precision_at_5
+       value: 16
+     - type: recall_at_1
+       value: 55.7
+     - type: recall_at_10
+       value: 85.2
+     - type: recall_at_100
+       value: 95.8
+     - type: recall_at_1000
+       value: 98.3
+     - type: recall_at_3
+       value: 74.2
+     - type: recall_at_5
+       value: 80
+   - task:
+       type: Classification
+     dataset:
+       type: C-MTEB/waimai-classification
+       name: MTEB Waimai
+       config: default
+       split: test
+       revision: None
+     metrics:
+     - type: accuracy
+       value: 84.54
+     - type: ap
+       value: 66.13603199670062
+     - type: f1
+       value: 82.61420654584116
+ ---
+ <!-- TODO: add evaluation results here -->
+ <br><br>
+
+ <p align="center">
+ <img src="https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/603763514de52ff951d89793/AFoybzd5lpBQXEBrQHuTt.png?w=200&h=200&f=face" alt="Finetuner logo: Finetuner helps you to create experiments in order to improve embeddings on search tasks. It accompanies you to deliver the last mile of performance-tuning for neural search applications." width="150px">
+ </p>
+
+ <p align="center">
+ <b>The text embedding set trained by <a href="https://jina.ai/"><b>Jina AI</b></a>.</b>
+ </p>
+
+ ## Quick Start
+
+ The easiest way to start using `jina-embeddings-v2-base-zh` is to use Jina AI's [Embedding API](https://jina.ai/embeddings/).
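+
+ For illustration, a minimal request sketch (assuming the API follows the common OpenAI-style `/v1/embeddings` schema; the exact endpoint, payload fields, and the `JINA_API_KEY` variable are assumptions here, so check the linked page for the authoritative parameters):
+
+ ```python
+ import os
+ import requests
+
+ # Hedged sketch of an Embedding API call; verify the endpoint and fields
+ # at https://jina.ai/embeddings/ before relying on them.
+ resp = requests.post(
+     "https://api.jina.ai/v1/embeddings",  # assumed endpoint
+     headers={"Authorization": f"Bearer {os.environ['JINA_API_KEY']}"},
+     json={
+         "model": "jina-embeddings-v2-base-zh",
+         "input": ["How is the weather today?", "今天天气怎么样?"],
+     },
+ )
+ embeddings = [item["embedding"] for item in resp.json()["data"]]  # assumed response shape
+ ```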
+
+ ## Intended Usage & Model Info
+
+ `jina-embeddings-v2-base-zh` is a Chinese/English bilingual text **embedding model** supporting an **8192-token sequence length**.
+ It is based on a BERT architecture (JinaBERT) that supports the symmetric bidirectional variant of [ALiBi](https://arxiv.org/abs/2108.12409) to allow longer sequence lengths; see the sketch after the model list below.
+ We have designed it for high performance in monolingual and cross-lingual applications and trained it specifically to support mixed Chinese-English input without bias.
+ Unlike earlier monolingual/multilingual embedding models, this bilingual model is designed to serve both monolingual (Chinese-to-Chinese) and cross-lingual (Chinese-to-English) document retrieval.
+ Additionally, we provide the following embedding models:
+
+ - [`jina-embeddings-v2-small-en`](https://huggingface.co/jinaai/jina-embeddings-v2-small-en): 33 million parameters.
+ - [`jina-embeddings-v2-base-en`](https://huggingface.co/jinaai/jina-embeddings-v2-base-en): 137 million parameters.
+ - [`jina-embeddings-v2-base-zh`](https://huggingface.co/jinaai/jina-embeddings-v2-base-zh): 161 million parameters, Chinese-English bilingual embeddings **(you are here)**.
+ - [`jina-embeddings-v2-base-de`](https://huggingface.co/jinaai/jina-embeddings-v2-base-de): 161 million parameters, German-English bilingual embeddings.
+ - [`jina-embeddings-v2-base-es`](): Spanish-English bilingual embeddings (soon).
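+
+ For intuition about the symmetric bidirectional ALiBi mentioned above, here is a minimal sketch of the bias each attention head adds to its logits (an illustration using the standard ALiBi slope recipe, not the actual JinaBERT code, which lives in the `jinaai/jina-bert-implementation` modeling files referenced by `config.json`):
+
+ ```python
+ import torch
+
+ def symmetric_alibi_bias(seq_len: int, num_heads: int) -> torch.Tensor:
+     # Head-specific slopes: a geometric sequence 2^(-8/n), 2^(-16/n), ...
+     # (exact for num_heads a power of two; implementations interleave otherwise).
+     slopes = torch.tensor([2.0 ** (-8.0 * (h + 1) / num_heads) for h in range(num_heads)])
+     pos = torch.arange(seq_len)
+     dist = (pos[None, :] - pos[:, None]).abs()  # symmetric |i - j|, no causal mask
+     return -slopes[:, None, None] * dist        # (num_heads, seq_len, seq_len)
+ ```
+
+ Because the penalty depends only on relative distance, the bias extrapolates to sequence lengths beyond those seen in training, which is what enables the 8192-token window without learned position embeddings.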
+
+ ## Data & Parameters
+
+ We will publish a report with technical details about the training of the bilingual models soon.
+ The training of the English model is described in this [technical report](https://arxiv.org/abs/2310.19923).
+
+ ## Usage
+
+ **<details><summary>Please apply mean pooling when integrating the model.</summary>**
+ <p>
+
+ ### Why mean pooling?
+
+ Mean pooling takes all token embeddings from the model output and averages them at the sentence/paragraph level.
+ It has proven to be the most effective way to produce high-quality sentence embeddings.
+ We offer an `encode` function to deal with this.
+
+ However, if you would like to do it without using the default `encode` function:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+ from transformers import AutoTokenizer, AutoModel
+
+ def mean_pooling(model_output, attention_mask):
+     token_embeddings = model_output[0]  # first element: per-token hidden states
+     # Broadcast the attention mask so padding tokens do not contribute to the average.
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+ sentences = ['How is the weather today?', '今天天气怎么样?']
+
+ tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-zh')
+ model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-zh', trust_remote_code=True)
+
+ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+
+ with torch.no_grad():
+     model_output = model(**encoded_input)
+
+ embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+ embeddings = F.normalize(embeddings, p=2, dim=1)  # L2-normalize for cosine similarity
+ ```
+
+ </p>
+ </details>
+
+ You can use Jina Embedding models directly from the `transformers` package.
+
+ First, make sure that you are logged into Hugging Face. You can either use the huggingface-cli tool (after installing the `transformers` package) and pass your [Hugging Face access token](https://huggingface.co/docs/hub/security-tokens):
+ ```bash
+ huggingface-cli login
+ ```
+ Alternatively, you can provide the access token as an environment variable in the shell:
+ ```bash
+ export HF_TOKEN="<your token here>"
+ ```
+ or in Python:
+ ```python
+ import os
+
+ os.environ['HF_TOKEN'] = "<your token here>"
+ ```
+
+ Then, you can load and use the model via the `AutoModel` class:
+ ```python
+ !pip install transformers
+ from transformers import AutoModel
+ from numpy.linalg import norm
+
+ cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))  # cosine similarity of two vectors
+ model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-zh', trust_remote_code=True)  # trust_remote_code is needed to use the encode method
+ embeddings = model.encode(['How is the weather today?', '今天天气怎么样?'])
+ print(cos_sim(embeddings[0], embeddings[1]))
+ ```
+
+ If you only want to handle shorter sequences, such as 2k, pass the `max_length` parameter to the `encode` function:
+
+ ```python
+ embeddings = model.encode(
+     ['Very long ... document'],
+     max_length=2048
+ )
+ ```
+
+ If you want to use the model together with the [sentence-transformers package](https://github.com/UKPLab/sentence-transformers/), make sure that you have installed the latest release and set `trust_remote_code=True` as well:
+
+ ```python
+ !pip install -U sentence-transformers
+ from sentence_transformers import SentenceTransformer
+ from numpy.linalg import norm
+
+ cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))
+ model = SentenceTransformer('jinaai/jina-embeddings-v2-base-zh', trust_remote_code=True)
+ embeddings = model.encode(['How is the weather today?', '今天天气怎么样?'])
+ print(cos_sim(embeddings[0], embeddings[1]))
+ ```
+
+ As of its latest release (v2.3.0), sentence-transformers also supports Jina embeddings natively (please make sure that you are logged into Hugging Face as well):
+
+ ```python
+ !pip install -U sentence-transformers
+ from sentence_transformers import SentenceTransformer
+ from sentence_transformers.util import cos_sim
+
+ model = SentenceTransformer(
+     "jinaai/jina-embeddings-v2-base-de",  # switch to en/zh for English or Chinese
+     trust_remote_code=True
+ )
+
+ # control your input sequence length up to 8192
+ model.max_seq_length = 1024
+
+ embeddings = model.encode([
+     'How is the weather today?',
+     'Wie ist das Wetter heute?'
+ ])
+ print(cos_sim(embeddings[0], embeddings[1]))
+ ```
+
+ ## Alternatives to Using Transformers Package
+
+ 1. _Managed SaaS_: Get started with a free key on Jina AI's [Embedding API](https://jina.ai/embeddings/).
+ 2. _Private and high-performance deployment_: Get started by picking from our suite of models and deploy them on [AWS Sagemaker](https://aws.amazon.com/marketplace/seller-profile?id=seller-stch2ludm6vgy).
+
+ ## Use Jina Embeddings for RAG
+
+ According to the latest blog post from [LlamaIndex](https://blog.llamaindex.ai/boosting-rag-picking-the-best-embedding-reranker-models-42d079022e83),
+
+ > In summary, to achieve the peak performance in both hit rate and MRR, the combination of OpenAI or JinaAI-Base embeddings with the CohereRerank/bge-reranker-large reranker stands out.
+
+ <img src="https://miro.medium.com/v2/resize:fit:4800/format:webp/1*ZP2RVejCZovF3FDCg-Bx3A.png" width="780px">
+
+ ## Troubleshooting
+
+ **Loading of Model Code failed**
+
+ If you forgot to pass the `trust_remote_code=True` flag when calling `AutoModel.from_pretrained` or initializing the model via the `SentenceTransformer` class, you will receive an error that the model weights could not be initialized.
+ This is caused by `transformers` falling back to creating a default BERT model instead of a jina-embedding model:
+
+ ```bash
+ Some weights of the model checkpoint at jinaai/jina-embeddings-v2-base-en were not used when initializing BertModel: ['encoder.layer.2.mlp.layernorm.weight', 'encoder.layer.3.mlp.layernorm.weight', 'encoder.layer.10.mlp.wo.bias', 'encoder.layer.5.mlp.wo.bias', 'encoder.layer.2.mlp.layernorm.bias', 'encoder.layer.1.mlp.gated_layers.weight', 'encoder.layer.5.mlp.gated_layers.weight', 'encoder.layer.8.mlp.layernorm.bias', ...
+ ```
+
+ **User is not logged into Hugging Face**
+
+ The model is only available under [gated access](https://huggingface.co/docs/hub/models-gated).
+ This means you need to be logged into Hugging Face to load it.
+ If you receive the following error, you need to provide an access token, either by using the huggingface-cli or by providing the token via an environment variable as described above:
+ ```bash
+ OSError: jinaai/jina-embeddings-v2-base-en is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
+ If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.
+ ```
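+
+ As a hedged illustration of the token route (recent `transformers` versions accept a `token=` argument; older ones use `use_auth_token=` as in the error message above):
+
+ ```python
+ import os
+ from transformers import AutoModel
+
+ # Pass the access token explicitly instead of relying on `huggingface-cli login`.
+ model = AutoModel.from_pretrained(
+     'jinaai/jina-embeddings-v2-base-zh',
+     trust_remote_code=True,
+     token=os.environ['HF_TOKEN'],  # or use_auth_token=... on older transformers
+ )
+ ```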
+
+ ## Contact
+
+ Join our [Discord community](https://discord.jina.ai) and chat with other community members about ideas.
+
+ ## Citation
+
+ If you find Jina Embeddings useful in your research, please cite the following paper:
+
+ ```
+ @misc{günther2023jina,
+       title={Jina Embeddings 2: 8192-Token General-Purpose Text Embeddings for Long Documents},
+       author={Michael Günther and Jackmin Ong and Isabelle Mohr and Alaeddine Abdessalem and Tanguy Abel and Mohammad Kalim Akram and Susana Guzman and Georgios Mastrapas and Saba Sturua and Bo Wang and Maximilian Werk and Nan Wang and Han Xiao},
+       year={2023},
+       eprint={2310.19923},
+       archivePrefix={arXiv},
+       primaryClass={cs.CL}
+ }
+ ```
config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "_name_or_path": "jinaai/jina-bert-implementation",
+   "architectures": [
+     "JinaBertModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "attn_implementation": "torch",
+   "auto_map": {
+     "AutoConfig": "jinaai/jina-bert-implementation--configuration_bert.JinaBertConfig",
+     "AutoModel": "jinaai/jina-bert-implementation--modeling_bert.JinaBertModel",
+     "AutoModelForMaskedLM": "jinaai/jina-bert-implementation--modeling_bert.JinaBertForMaskedLM",
+     "AutoModelForQuestionAnswering": "jinaai/jina-bert-implementation--modeling_bert.JinaBertForQuestionAnswering",
+     "AutoModelForSequenceClassification": "jinaai/jina-bert-implementation--modeling_bert.JinaBertForSequenceClassification",
+     "AutoModelForTokenClassification": "jinaai/jina-bert-implementation--modeling_bert.JinaBertForTokenClassification"
+   },
+   "classifier_dropout": null,
+   "emb_pooler": "mean",
+   "feed_forward_type": "geglu",
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 8192,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "alibi",
+   "torch_dtype": "float16",
+   "transformers_version": "4.30.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 61056
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "__version__": {
+     "sentence_transformers": "2.2.2",
+     "transformers": "4.30.2",
+     "pytorch": "2.0.1"
+   }
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:29b7cdda0c8fa9b18f8e0fbc4aba0c9537555fb16b139fa44be92c1e1b3253a8
+ size 321648328
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   }
+ ]
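
Together with `1_Pooling/config.json` above, this file tells sentence-transformers to run the Transformer encoder first and then mean pooling. A rough hand-built equivalent of that two-module pipeline (a sketch for illustration, not how the library actually parses this repository):

```python
from sentence_transformers import SentenceTransformer, models

# Module 0: the JinaBERT encoder (path "" = repository root).
word = models.Transformer('jinaai/jina-embeddings-v2-base-zh',
                          model_args={'trust_remote_code': True})
# Module 1: mean pooling, matching 1_Pooling/config.json
# (pooling_mode_mean_tokens: true, dimension 768).
pool = models.Pooling(word.get_word_embedding_dimension(), pooling_mode='mean')

model = SentenceTransformer(modules=[word, pool])
```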
onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4b0e9fa6e5c77cff56e0c9c673ba1aad61e793e592fdd4b05690b68826b7d3a2
+ size 641212851
onnx/model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a221ee9e6a6647ccc59cee7bdd26a7b8cf0c0cd3481a65f358d9585a23f02f4
+ size 161565239
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a122286d0df6ae67a782cbc7ee1c1cd3c826b00adceb5a5e2efb41ab523b2b03
+ size 321664570
sentence_bert_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "max_seq_length": 8192,
+   "do_lower_case": false,
+   "model_args": {"trust_remote_code": true}
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "unk_token": "<unk>"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "errors": "replace",
+   "mask_token": "<mask>",
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "unk_token": "<unk>"
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff