infgrad committed on
Commit
c76f532
1 Parent(s): e843b63

add models

Browse files
Files changed (7) hide show
  1. README.md +1320 -1
  2. config.json +34 -0
  3. pytorch_model.bin +3 -0
  4. special_tokens_map.json +7 -0
  5. tokenizer.json +0 -0
  6. tokenizer_config.json +13 -0
  7. vocab.txt +0 -0
README.md CHANGED
@@ -1,3 +1,1322 @@
1
  ---
2
- license: mit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ pipeline_tag: sentence-similarity
3
+ tags:
4
+ - sentence-transformers
5
+ - feature-extraction
6
+ - sentence-similarity
7
+ - mteb
8
+ model-index:
9
+ - name: stella-base-zh-v2
10
+ results:
11
+ - task:
12
+ type: STS
13
+ dataset:
14
+ type: C-MTEB/AFQMC
15
+ name: MTEB AFQMC
16
+ config: default
17
+ split: validation
18
+ revision: None
19
+ metrics:
20
+ - type: cos_sim_pearson
21
+ value: 44.62083443545288
22
+ - type: cos_sim_spearman
23
+ value: 46.72814628391134
24
+ - type: euclidean_pearson
25
+ value: 45.11522093816821
26
+ - type: euclidean_spearman
27
+ value: 46.72818648900957
28
+ - type: manhattan_pearson
29
+ value: 44.98820754682395
30
+ - type: manhattan_spearman
31
+ value: 46.63576705524296
32
+ - task:
33
+ type: STS
34
+ dataset:
35
+ type: C-MTEB/ATEC
36
+ name: MTEB ATEC
37
+ config: default
38
+ split: test
39
+ revision: None
40
+ metrics:
41
+ - type: cos_sim_pearson
42
+ value: 49.543902370260234
43
+ - type: cos_sim_spearman
44
+ value: 51.22161152883018
45
+ - type: euclidean_pearson
46
+ value: 53.49586541060596
47
+ - type: euclidean_spearman
48
+ value: 51.22161490583934
49
+ - type: manhattan_pearson
50
+ value: 53.51023339947787
51
+ - type: manhattan_spearman
52
+ value: 51.22426632538443
53
+ - task:
54
+ type: Classification
55
+ dataset:
56
+ type: mteb/amazon_reviews_multi
57
+ name: MTEB AmazonReviewsClassification (zh)
58
+ config: zh
59
+ split: test
60
+ revision: 1399c76144fd37290681b995c656ef9b2e06e26d
61
+ metrics:
62
+ - type: accuracy
63
+ value: 39.644
64
+ - type: f1
65
+ value: 37.67897186741224
66
+ - task:
67
+ type: STS
68
+ dataset:
69
+ type: C-MTEB/BQ
70
+ name: MTEB BQ
71
+ config: default
72
+ split: test
73
+ revision: None
74
+ metrics:
75
+ - type: cos_sim_pearson
76
+ value: 61.96416237112325
77
+ - type: cos_sim_spearman
78
+ value: 64.80484064041543
79
+ - type: euclidean_pearson
80
+ value: 63.281983537100594
81
+ - type: euclidean_spearman
82
+ value: 64.80483024694405
83
+ - type: manhattan_pearson
84
+ value: 63.266046412399426
85
+ - type: manhattan_spearman
86
+ value: 64.79643672829964
87
+ - task:
88
+ type: Clustering
89
+ dataset:
90
+ type: C-MTEB/CLSClusteringP2P
91
+ name: MTEB CLSClusteringP2P
92
+ config: default
93
+ split: test
94
+ revision: None
95
+ metrics:
96
+ - type: v_measure
97
+ value: 40.25857488823951
98
+ - task:
99
+ type: Clustering
100
+ dataset:
101
+ type: C-MTEB/CLSClusteringS2S
102
+ name: MTEB CLSClusteringS2S
103
+ config: default
104
+ split: test
105
+ revision: None
106
+ metrics:
107
+ - type: v_measure
108
+ value: 37.17501553349549
109
+ - task:
110
+ type: Reranking
111
+ dataset:
112
+ type: C-MTEB/CMedQAv1-reranking
113
+ name: MTEB CMedQAv1
114
+ config: default
115
+ split: test
116
+ revision: None
117
+ metrics:
118
+ - type: map
119
+ value: 84.69751849160603
120
+ - type: mrr
121
+ value: 87.16257936507937
122
+ - task:
123
+ type: Reranking
124
+ dataset:
125
+ type: C-MTEB/CMedQAv2-reranking
126
+ name: MTEB CMedQAv2
127
+ config: default
128
+ split: test
129
+ revision: None
130
+ metrics:
131
+ - type: map
132
+ value: 85.31468551417655
133
+ - type: mrr
134
+ value: 87.74658730158731
135
+ - task:
136
+ type: Retrieval
137
+ dataset:
138
+ type: C-MTEB/CmedqaRetrieval
139
+ name: MTEB CmedqaRetrieval
140
+ config: default
141
+ split: dev
142
+ revision: None
143
+ metrics:
144
+ - type: map_at_1
145
+ value: 24.181
146
+ - type: map_at_10
147
+ value: 35.615
148
+ - type: map_at_100
149
+ value: 37.444
150
+ - type: map_at_1000
151
+ value: 37.573
152
+ - type: map_at_3
153
+ value: 31.679000000000002
154
+ - type: map_at_5
155
+ value: 33.854
156
+ - type: mrr_at_1
157
+ value: 37.108999999999995
158
+ - type: mrr_at_10
159
+ value: 44.653
160
+ - type: mrr_at_100
161
+ value: 45.647
162
+ - type: mrr_at_1000
163
+ value: 45.701
164
+ - type: mrr_at_3
165
+ value: 42.256
166
+ - type: mrr_at_5
167
+ value: 43.497
168
+ - type: ndcg_at_1
169
+ value: 37.108999999999995
170
+ - type: ndcg_at_10
171
+ value: 42.028999999999996
172
+ - type: ndcg_at_100
173
+ value: 49.292
174
+ - type: ndcg_at_1000
175
+ value: 51.64
176
+ - type: ndcg_at_3
177
+ value: 37.017
178
+ - type: ndcg_at_5
179
+ value: 38.997
180
+ - type: precision_at_1
181
+ value: 37.108999999999995
182
+ - type: precision_at_10
183
+ value: 9.386999999999999
184
+ - type: precision_at_100
185
+ value: 1.536
186
+ - type: precision_at_1000
187
+ value: 0.183
188
+ - type: precision_at_3
189
+ value: 20.93
190
+ - type: precision_at_5
191
+ value: 15.268999999999998
192
+ - type: recall_at_1
193
+ value: 24.181
194
+ - type: recall_at_10
195
+ value: 51.961999999999996
196
+ - type: recall_at_100
197
+ value: 82.122
198
+ - type: recall_at_1000
199
+ value: 98.059
200
+ - type: recall_at_3
201
+ value: 36.730000000000004
202
+ - type: recall_at_5
203
+ value: 42.884
204
+ - task:
205
+ type: PairClassification
206
+ dataset:
207
+ type: C-MTEB/CMNLI
208
+ name: MTEB Cmnli
209
+ config: default
210
+ split: validation
211
+ revision: None
212
+ metrics:
213
+ - type: cos_sim_accuracy
214
+ value: 76.23571858087793
215
+ - type: cos_sim_ap
216
+ value: 84.75290046905519
217
+ - type: cos_sim_f1
218
+ value: 77.70114942528735
219
+ - type: cos_sim_precision
220
+ value: 73.05475504322767
221
+ - type: cos_sim_recall
222
+ value: 82.97872340425532
223
+ - type: dot_accuracy
224
+ value: 76.23571858087793
225
+ - type: dot_ap
226
+ value: 84.75113928508674
227
+ - type: dot_f1
228
+ value: 77.70114942528735
229
+ - type: dot_precision
230
+ value: 73.05475504322767
231
+ - type: dot_recall
232
+ value: 82.97872340425532
233
+ - type: euclidean_accuracy
234
+ value: 76.23571858087793
235
+ - type: euclidean_ap
236
+ value: 84.75289931658567
237
+ - type: euclidean_f1
238
+ value: 77.70114942528735
239
+ - type: euclidean_precision
240
+ value: 73.05475504322767
241
+ - type: euclidean_recall
242
+ value: 82.97872340425532
243
+ - type: manhattan_accuracy
244
+ value: 76.17558628983764
245
+ - type: manhattan_ap
246
+ value: 84.75764676597448
247
+ - type: manhattan_f1
248
+ value: 77.73437499999999
249
+ - type: manhattan_precision
250
+ value: 72.52480259161773
251
+ - type: manhattan_recall
252
+ value: 83.75029226093056
253
+ - type: max_accuracy
254
+ value: 76.23571858087793
255
+ - type: max_ap
256
+ value: 84.75764676597448
257
+ - type: max_f1
258
+ value: 77.73437499999999
259
+ - task:
260
+ type: Retrieval
261
+ dataset:
262
+ type: C-MTEB/CovidRetrieval
263
+ name: MTEB CovidRetrieval
264
+ config: default
265
+ split: dev
266
+ revision: None
267
+ metrics:
268
+ - type: map_at_1
269
+ value: 67.43900000000001
270
+ - type: map_at_10
271
+ value: 76.00099999999999
272
+ - type: map_at_100
273
+ value: 76.297
274
+ - type: map_at_1000
275
+ value: 76.29899999999999
276
+ - type: map_at_3
277
+ value: 74.412
278
+ - type: map_at_5
279
+ value: 75.177
280
+ - type: mrr_at_1
281
+ value: 67.65
282
+ - type: mrr_at_10
283
+ value: 76.007
284
+ - type: mrr_at_100
285
+ value: 76.322
286
+ - type: mrr_at_1000
287
+ value: 76.324
288
+ - type: mrr_at_3
289
+ value: 74.464
290
+ - type: mrr_at_5
291
+ value: 75.265
292
+ - type: ndcg_at_1
293
+ value: 67.65
294
+ - type: ndcg_at_10
295
+ value: 79.85600000000001
296
+ - type: ndcg_at_100
297
+ value: 81.34400000000001
298
+ - type: ndcg_at_1000
299
+ value: 81.44200000000001
300
+ - type: ndcg_at_3
301
+ value: 76.576
302
+ - type: ndcg_at_5
303
+ value: 77.956
304
+ - type: precision_at_1
305
+ value: 67.65
306
+ - type: precision_at_10
307
+ value: 9.283
308
+ - type: precision_at_100
309
+ value: 0.9990000000000001
310
+ - type: precision_at_1000
311
+ value: 0.101
312
+ - type: precision_at_3
313
+ value: 27.749000000000002
314
+ - type: precision_at_5
315
+ value: 17.345
316
+ - type: recall_at_1
317
+ value: 67.43900000000001
318
+ - type: recall_at_10
319
+ value: 91.781
320
+ - type: recall_at_100
321
+ value: 98.84100000000001
322
+ - type: recall_at_1000
323
+ value: 99.684
324
+ - type: recall_at_3
325
+ value: 82.719
326
+ - type: recall_at_5
327
+ value: 86.038
328
+ - task:
329
+ type: Retrieval
330
+ dataset:
331
+ type: C-MTEB/DuRetrieval
332
+ name: MTEB DuRetrieval
333
+ config: default
334
+ split: dev
335
+ revision: None
336
+ metrics:
337
+ - type: map_at_1
338
+ value: 25.354
339
+ - type: map_at_10
340
+ value: 79.499
341
+ - type: map_at_100
342
+ value: 82.416
343
+ - type: map_at_1000
344
+ value: 82.451
345
+ - type: map_at_3
346
+ value: 54.664
347
+ - type: map_at_5
348
+ value: 69.378
349
+ - type: mrr_at_1
350
+ value: 89.25
351
+ - type: mrr_at_10
352
+ value: 92.666
353
+ - type: mrr_at_100
354
+ value: 92.738
355
+ - type: mrr_at_1000
356
+ value: 92.74
357
+ - type: mrr_at_3
358
+ value: 92.342
359
+ - type: mrr_at_5
360
+ value: 92.562
361
+ - type: ndcg_at_1
362
+ value: 89.25
363
+ - type: ndcg_at_10
364
+ value: 86.97
365
+ - type: ndcg_at_100
366
+ value: 89.736
367
+ - type: ndcg_at_1000
368
+ value: 90.069
369
+ - type: ndcg_at_3
370
+ value: 85.476
371
+ - type: ndcg_at_5
372
+ value: 84.679
373
+ - type: precision_at_1
374
+ value: 89.25
375
+ - type: precision_at_10
376
+ value: 41.9
377
+ - type: precision_at_100
378
+ value: 4.811
379
+ - type: precision_at_1000
380
+ value: 0.48900000000000005
381
+ - type: precision_at_3
382
+ value: 76.86699999999999
383
+ - type: precision_at_5
384
+ value: 65.25
385
+ - type: recall_at_1
386
+ value: 25.354
387
+ - type: recall_at_10
388
+ value: 88.64999999999999
389
+ - type: recall_at_100
390
+ value: 97.56
391
+ - type: recall_at_1000
392
+ value: 99.37
393
+ - type: recall_at_3
394
+ value: 57.325
395
+ - type: recall_at_5
396
+ value: 74.614
397
+ - task:
398
+ type: Retrieval
399
+ dataset:
400
+ type: C-MTEB/EcomRetrieval
401
+ name: MTEB EcomRetrieval
402
+ config: default
403
+ split: dev
404
+ revision: None
405
+ metrics:
406
+ - type: map_at_1
407
+ value: 48.3
408
+ - type: map_at_10
409
+ value: 57.765
410
+ - type: map_at_100
411
+ value: 58.418000000000006
412
+ - type: map_at_1000
413
+ value: 58.43899999999999
414
+ - type: map_at_3
415
+ value: 54.883
416
+ - type: map_at_5
417
+ value: 56.672999999999995
418
+ - type: mrr_at_1
419
+ value: 48.3
420
+ - type: mrr_at_10
421
+ value: 57.765
422
+ - type: mrr_at_100
423
+ value: 58.418000000000006
424
+ - type: mrr_at_1000
425
+ value: 58.43899999999999
426
+ - type: mrr_at_3
427
+ value: 54.883
428
+ - type: mrr_at_5
429
+ value: 56.672999999999995
430
+ - type: ndcg_at_1
431
+ value: 48.3
432
+ - type: ndcg_at_10
433
+ value: 62.846000000000004
434
+ - type: ndcg_at_100
435
+ value: 65.845
436
+ - type: ndcg_at_1000
437
+ value: 66.369
438
+ - type: ndcg_at_3
439
+ value: 56.996
440
+ - type: ndcg_at_5
441
+ value: 60.214999999999996
442
+ - type: precision_at_1
443
+ value: 48.3
444
+ - type: precision_at_10
445
+ value: 7.9
446
+ - type: precision_at_100
447
+ value: 0.9259999999999999
448
+ - type: precision_at_1000
449
+ value: 0.097
450
+ - type: precision_at_3
451
+ value: 21.032999999999998
452
+ - type: precision_at_5
453
+ value: 14.180000000000001
454
+ - type: recall_at_1
455
+ value: 48.3
456
+ - type: recall_at_10
457
+ value: 79.0
458
+ - type: recall_at_100
459
+ value: 92.60000000000001
460
+ - type: recall_at_1000
461
+ value: 96.7
462
+ - type: recall_at_3
463
+ value: 63.1
464
+ - type: recall_at_5
465
+ value: 70.89999999999999
466
+ - task:
467
+ type: Classification
468
+ dataset:
469
+ type: C-MTEB/IFlyTek-classification
470
+ name: MTEB IFlyTek
471
+ config: default
472
+ split: validation
473
+ revision: None
474
+ metrics:
475
+ - type: accuracy
476
+ value: 47.895344363216616
477
+ - type: f1
478
+ value: 34.95151253165417
479
+ - task:
480
+ type: Classification
481
+ dataset:
482
+ type: C-MTEB/JDReview-classification
483
+ name: MTEB JDReview
484
+ config: default
485
+ split: test
486
+ revision: None
487
+ metrics:
488
+ - type: accuracy
489
+ value: 84.78424015009381
490
+ - type: ap
491
+ value: 52.436279969597685
492
+ - type: f1
493
+ value: 79.49258679392281
494
+ - task:
495
+ type: STS
496
+ dataset:
497
+ type: C-MTEB/LCQMC
498
+ name: MTEB LCQMC
499
+ config: default
500
+ split: test
501
+ revision: None
502
+ metrics:
503
+ - type: cos_sim_pearson
504
+ value: 70.2307617475436
505
+ - type: cos_sim_spearman
506
+ value: 76.88912653700545
507
+ - type: euclidean_pearson
508
+ value: 75.47976675486538
509
+ - type: euclidean_spearman
510
+ value: 76.88912210059333
511
+ - type: manhattan_pearson
512
+ value: 75.45834919257487
513
+ - type: manhattan_spearman
514
+ value: 76.8669208121889
515
+ - task:
516
+ type: Reranking
517
+ dataset:
518
+ type: C-MTEB/Mmarco-reranking
519
+ name: MTEB MMarcoReranking
520
+ config: default
521
+ split: dev
522
+ revision: None
523
+ metrics:
524
+ - type: map
525
+ value: 28.047948482579244
526
+ - type: mrr
527
+ value: 26.63809523809524
528
+ - task:
529
+ type: Retrieval
530
+ dataset:
531
+ type: C-MTEB/MMarcoRetrieval
532
+ name: MTEB MMarcoRetrieval
533
+ config: default
534
+ split: dev
535
+ revision: None
536
+ metrics:
537
+ - type: map_at_1
538
+ value: 65.837
539
+ - type: map_at_10
540
+ value: 74.72
541
+ - type: map_at_100
542
+ value: 75.068
543
+ - type: map_at_1000
544
+ value: 75.079
545
+ - type: map_at_3
546
+ value: 72.832
547
+ - type: map_at_5
548
+ value: 74.07000000000001
549
+ - type: mrr_at_1
550
+ value: 68.009
551
+ - type: mrr_at_10
552
+ value: 75.29400000000001
553
+ - type: mrr_at_100
554
+ value: 75.607
555
+ - type: mrr_at_1000
556
+ value: 75.617
557
+ - type: mrr_at_3
558
+ value: 73.677
559
+ - type: mrr_at_5
560
+ value: 74.74199999999999
561
+ - type: ndcg_at_1
562
+ value: 68.009
563
+ - type: ndcg_at_10
564
+ value: 78.36
565
+ - type: ndcg_at_100
566
+ value: 79.911
567
+ - type: ndcg_at_1000
568
+ value: 80.226
569
+ - type: ndcg_at_3
570
+ value: 74.825
571
+ - type: ndcg_at_5
572
+ value: 76.9
573
+ - type: precision_at_1
574
+ value: 68.009
575
+ - type: precision_at_10
576
+ value: 9.463000000000001
577
+ - type: precision_at_100
578
+ value: 1.023
579
+ - type: precision_at_1000
580
+ value: 0.105
581
+ - type: precision_at_3
582
+ value: 28.075
583
+ - type: precision_at_5
584
+ value: 17.951
585
+ - type: recall_at_1
586
+ value: 65.837
587
+ - type: recall_at_10
588
+ value: 89.00099999999999
589
+ - type: recall_at_100
590
+ value: 95.968
591
+ - type: recall_at_1000
592
+ value: 98.461
593
+ - type: recall_at_3
594
+ value: 79.69800000000001
595
+ - type: recall_at_5
596
+ value: 84.623
597
+ - task:
598
+ type: Classification
599
+ dataset:
600
+ type: mteb/amazon_massive_intent
601
+ name: MTEB MassiveIntentClassification (zh-CN)
602
+ config: zh-CN
603
+ split: test
604
+ revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
605
+ metrics:
606
+ - type: accuracy
607
+ value: 68.08675184936112
608
+ - type: f1
609
+ value: 65.51466585063827
610
+ - task:
611
+ type: Classification
612
+ dataset:
613
+ type: mteb/amazon_massive_scenario
614
+ name: MTEB MassiveScenarioClassification (zh-CN)
615
+ config: zh-CN
616
+ split: test
617
+ revision: 7d571f92784cd94a019292a1f45445077d0ef634
618
+ metrics:
619
+ - type: accuracy
620
+ value: 73.22461331540013
621
+ - type: f1
622
+ value: 72.675432030145
623
+ - task:
624
+ type: Retrieval
625
+ dataset:
626
+ type: C-MTEB/MedicalRetrieval
627
+ name: MTEB MedicalRetrieval
628
+ config: default
629
+ split: dev
630
+ revision: None
631
+ metrics:
632
+ - type: map_at_1
633
+ value: 49.2
634
+ - type: map_at_10
635
+ value: 55.394
636
+ - type: map_at_100
637
+ value: 55.883
638
+ - type: map_at_1000
639
+ value: 55.93900000000001
640
+ - type: map_at_3
641
+ value: 53.733
642
+ - type: map_at_5
643
+ value: 54.778000000000006
644
+ - type: mrr_at_1
645
+ value: 49.3
646
+ - type: mrr_at_10
647
+ value: 55.444
648
+ - type: mrr_at_100
649
+ value: 55.933
650
+ - type: mrr_at_1000
651
+ value: 55.989
652
+ - type: mrr_at_3
653
+ value: 53.783
654
+ - type: mrr_at_5
655
+ value: 54.827999999999996
656
+ - type: ndcg_at_1
657
+ value: 49.2
658
+ - type: ndcg_at_10
659
+ value: 58.501999999999995
660
+ - type: ndcg_at_100
661
+ value: 61.181
662
+ - type: ndcg_at_1000
663
+ value: 62.848000000000006
664
+ - type: ndcg_at_3
665
+ value: 55.143
666
+ - type: ndcg_at_5
667
+ value: 57.032000000000004
668
+ - type: precision_at_1
669
+ value: 49.2
670
+ - type: precision_at_10
671
+ value: 6.83
672
+ - type: precision_at_100
673
+ value: 0.815
674
+ - type: precision_at_1000
675
+ value: 0.095
676
+ - type: precision_at_3
677
+ value: 19.733
678
+ - type: precision_at_5
679
+ value: 12.76
680
+ - type: recall_at_1
681
+ value: 49.2
682
+ - type: recall_at_10
683
+ value: 68.30000000000001
684
+ - type: recall_at_100
685
+ value: 81.5
686
+ - type: recall_at_1000
687
+ value: 95.0
688
+ - type: recall_at_3
689
+ value: 59.199999999999996
690
+ - type: recall_at_5
691
+ value: 63.800000000000004
692
+ - task:
693
+ type: Classification
694
+ dataset:
695
+ type: C-MTEB/MultilingualSentiment-classification
696
+ name: MTEB MultilingualSentiment
697
+ config: default
698
+ split: validation
699
+ revision: None
700
+ metrics:
701
+ - type: accuracy
702
+ value: 71.66666666666666
703
+ - type: f1
704
+ value: 70.92944632461379
705
+ - task:
706
+ type: PairClassification
707
+ dataset:
708
+ type: C-MTEB/OCNLI
709
+ name: MTEB Ocnli
710
+ config: default
711
+ split: validation
712
+ revision: None
713
+ metrics:
714
+ - type: cos_sim_accuracy
715
+ value: 70.00541418516514
716
+ - type: cos_sim_ap
717
+ value: 75.16499510773514
718
+ - type: cos_sim_f1
719
+ value: 73.09435517099301
720
+ - type: cos_sim_precision
721
+ value: 59.932432432432435
722
+ - type: cos_sim_recall
723
+ value: 93.66420274551214
724
+ - type: dot_accuracy
725
+ value: 70.00541418516514
726
+ - type: dot_ap
727
+ value: 75.16499510773514
728
+ - type: dot_f1
729
+ value: 73.09435517099301
730
+ - type: dot_precision
731
+ value: 59.932432432432435
732
+ - type: dot_recall
733
+ value: 93.66420274551214
734
+ - type: euclidean_accuracy
735
+ value: 70.00541418516514
736
+ - type: euclidean_ap
737
+ value: 75.16499510773514
738
+ - type: euclidean_f1
739
+ value: 73.09435517099301
740
+ - type: euclidean_precision
741
+ value: 59.932432432432435
742
+ - type: euclidean_recall
743
+ value: 93.66420274551214
744
+ - type: manhattan_accuracy
745
+ value: 70.11369788846778
746
+ - type: manhattan_ap
747
+ value: 75.1259071890593
748
+ - type: manhattan_f1
749
+ value: 72.91399229781771
750
+ - type: manhattan_precision
751
+ value: 61.294964028776974
752
+ - type: manhattan_recall
753
+ value: 89.96832101372756
754
+ - type: max_accuracy
755
+ value: 70.11369788846778
756
+ - type: max_ap
757
+ value: 75.16499510773514
758
+ - type: max_f1
759
+ value: 73.09435517099301
760
+ - task:
761
+ type: Classification
762
+ dataset:
763
+ type: C-MTEB/OnlineShopping-classification
764
+ name: MTEB OnlineShopping
765
+ config: default
766
+ split: test
767
+ revision: None
768
+ metrics:
769
+ - type: accuracy
770
+ value: 91.38000000000002
771
+ - type: ap
772
+ value: 89.12250244489272
773
+ - type: f1
774
+ value: 91.36604511107015
775
+ - task:
776
+ type: STS
777
+ dataset:
778
+ type: C-MTEB/PAWSX
779
+ name: MTEB PAWSX
780
+ config: default
781
+ split: test
782
+ revision: None
783
+ metrics:
784
+ - type: cos_sim_pearson
785
+ value: 24.231255568030463
786
+ - type: cos_sim_spearman
787
+ value: 29.6964906904186
788
+ - type: euclidean_pearson
789
+ value: 30.166130502867016
790
+ - type: euclidean_spearman
791
+ value: 29.69614167804371
792
+ - type: manhattan_pearson
793
+ value: 30.166606116745935
794
+ - type: manhattan_spearman
795
+ value: 29.62681453661945
796
+ - task:
797
+ type: STS
798
+ dataset:
799
+ type: C-MTEB/QBQTC
800
+ name: MTEB QBQTC
801
+ config: default
802
+ split: test
803
+ revision: None
804
+ metrics:
805
+ - type: cos_sim_pearson
806
+ value: 34.88835755574809
807
+ - type: cos_sim_spearman
808
+ value: 37.3797926051053
809
+ - type: euclidean_pearson
810
+ value: 35.46629492698549
811
+ - type: euclidean_spearman
812
+ value: 37.37987510604593
813
+ - type: manhattan_pearson
814
+ value: 35.4953353526957
815
+ - type: manhattan_spearman
816
+ value: 37.41397231689605
817
+ - task:
818
+ type: STS
819
+ dataset:
820
+ type: mteb/sts22-crosslingual-sts
821
+ name: MTEB STS22 (zh)
822
+ config: zh
823
+ split: test
824
+ revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
825
+ metrics:
826
+ - type: cos_sim_pearson
827
+ value: 67.79575721136626
828
+ - type: cos_sim_spearman
829
+ value: 69.02068400784196
830
+ - type: euclidean_pearson
831
+ value: 68.30675023447176
832
+ - type: euclidean_spearman
833
+ value: 69.02068400784196
834
+ - type: manhattan_pearson
835
+ value: 69.91284259797827
836
+ - type: manhattan_spearman
837
+ value: 70.31717787763641
838
+ - task:
839
+ type: STS
840
+ dataset:
841
+ type: C-MTEB/STSB
842
+ name: MTEB STSB
843
+ config: default
844
+ split: test
845
+ revision: None
846
+ metrics:
847
+ - type: cos_sim_pearson
848
+ value: 79.05026785034129
849
+ - type: cos_sim_spearman
850
+ value: 79.62719014756249
851
+ - type: euclidean_pearson
852
+ value: 79.13305301290063
853
+ - type: euclidean_spearman
854
+ value: 79.62710682651051
855
+ - type: manhattan_pearson
856
+ value: 79.07012559140433
857
+ - type: manhattan_spearman
858
+ value: 79.58333069893605
859
+ - task:
860
+ type: Reranking
861
+ dataset:
862
+ type: C-MTEB/T2Reranking
863
+ name: MTEB T2Reranking
864
+ config: default
865
+ split: dev
866
+ revision: None
867
+ metrics:
868
+ - type: map
869
+ value: 66.34533369244325
870
+ - type: mrr
871
+ value: 75.93632792769557
872
+ - task:
873
+ type: Retrieval
874
+ dataset:
875
+ type: C-MTEB/T2Retrieval
876
+ name: MTEB T2Retrieval
877
+ config: default
878
+ split: dev
879
+ revision: None
880
+ metrics:
881
+ - type: map_at_1
882
+ value: 26.995
883
+ - type: map_at_10
884
+ value: 76.083
885
+ - type: map_at_100
886
+ value: 79.727
887
+ - type: map_at_1000
888
+ value: 79.798
889
+ - type: map_at_3
890
+ value: 53.455
891
+ - type: map_at_5
892
+ value: 65.747
893
+ - type: mrr_at_1
894
+ value: 89.536
895
+ - type: mrr_at_10
896
+ value: 91.972
897
+ - type: mrr_at_100
898
+ value: 92.07
899
+ - type: mrr_at_1000
900
+ value: 92.07499999999999
901
+ - type: mrr_at_3
902
+ value: 91.52900000000001
903
+ - type: mrr_at_5
904
+ value: 91.806
905
+ - type: ndcg_at_1
906
+ value: 89.536
907
+ - type: ndcg_at_10
908
+ value: 83.756
909
+ - type: ndcg_at_100
910
+ value: 87.468
911
+ - type: ndcg_at_1000
912
+ value: 88.16199999999999
913
+ - type: ndcg_at_3
914
+ value: 85.349
915
+ - type: ndcg_at_5
916
+ value: 83.855
917
+ - type: precision_at_1
918
+ value: 89.536
919
+ - type: precision_at_10
920
+ value: 41.713
921
+ - type: precision_at_100
922
+ value: 4.994
923
+ - type: precision_at_1000
924
+ value: 0.515
925
+ - type: precision_at_3
926
+ value: 74.81400000000001
927
+ - type: precision_at_5
928
+ value: 62.678
929
+ - type: recall_at_1
930
+ value: 26.995
931
+ - type: recall_at_10
932
+ value: 82.586
933
+ - type: recall_at_100
934
+ value: 94.726
935
+ - type: recall_at_1000
936
+ value: 98.276
937
+ - type: recall_at_3
938
+ value: 55.106
939
+ - type: recall_at_5
940
+ value: 69.096
941
+ - task:
942
+ type: Classification
943
+ dataset:
944
+ type: C-MTEB/TNews-classification
945
+ name: MTEB TNews
946
+ config: default
947
+ split: validation
948
+ revision: None
949
+ metrics:
950
+ - type: accuracy
951
+ value: 51.25200000000001
952
+ - type: f1
953
+ value: 49.43760438233612
954
+ - task:
955
+ type: Clustering
956
+ dataset:
957
+ type: C-MTEB/ThuNewsClusteringP2P
958
+ name: MTEB ThuNewsClusteringP2P
959
+ config: default
960
+ split: test
961
+ revision: None
962
+ metrics:
963
+ - type: v_measure
964
+ value: 62.18575394560257
965
+ - task:
966
+ type: Clustering
967
+ dataset:
968
+ type: C-MTEB/ThuNewsClusteringS2S
969
+ name: MTEB ThuNewsClusteringS2S
970
+ config: default
971
+ split: test
972
+ revision: None
973
+ metrics:
974
+ - type: v_measure
975
+ value: 57.97489103903411
976
+ - task:
977
+ type: Retrieval
978
+ dataset:
979
+ type: C-MTEB/VideoRetrieval
980
+ name: MTEB VideoRetrieval
981
+ config: default
982
+ split: dev
983
+ revision: None
984
+ metrics:
985
+ - type: map_at_1
986
+ value: 52.2
987
+ - type: map_at_10
988
+ value: 63.23800000000001
989
+ - type: map_at_100
990
+ value: 63.788
991
+ - type: map_at_1000
992
+ value: 63.800999999999995
993
+ - type: map_at_3
994
+ value: 61.016999999999996
995
+ - type: map_at_5
996
+ value: 62.392
997
+ - type: mrr_at_1
998
+ value: 52.2
999
+ - type: mrr_at_10
1000
+ value: 63.23800000000001
1001
+ - type: mrr_at_100
1002
+ value: 63.788
1003
+ - type: mrr_at_1000
1004
+ value: 63.800999999999995
1005
+ - type: mrr_at_3
1006
+ value: 61.016999999999996
1007
+ - type: mrr_at_5
1008
+ value: 62.392
1009
+ - type: ndcg_at_1
1010
+ value: 52.2
1011
+ - type: ndcg_at_10
1012
+ value: 68.273
1013
+ - type: ndcg_at_100
1014
+ value: 70.892
1015
+ - type: ndcg_at_1000
1016
+ value: 71.207
1017
+ - type: ndcg_at_3
1018
+ value: 63.794
1019
+ - type: ndcg_at_5
1020
+ value: 66.268
1021
+ - type: precision_at_1
1022
+ value: 52.2
1023
+ - type: precision_at_10
1024
+ value: 8.39
1025
+ - type: precision_at_100
1026
+ value: 0.96
1027
+ - type: precision_at_1000
1028
+ value: 0.098
1029
+ - type: precision_at_3
1030
+ value: 23.933
1031
+ - type: precision_at_5
1032
+ value: 15.559999999999999
1033
+ - type: recall_at_1
1034
+ value: 52.2
1035
+ - type: recall_at_10
1036
+ value: 83.89999999999999
1037
+ - type: recall_at_100
1038
+ value: 96.0
1039
+ - type: recall_at_1000
1040
+ value: 98.4
1041
+ - type: recall_at_3
1042
+ value: 71.8
1043
+ - type: recall_at_5
1044
+ value: 77.8
1045
+ - task:
1046
+ type: Classification
1047
+ dataset:
1048
+ type: C-MTEB/waimai-classification
1049
+ name: MTEB Waimai
1050
+ config: default
1051
+ split: test
1052
+ revision: None
1053
+ metrics:
1054
+ - type: accuracy
1055
+ value: 86.67999999999999
1056
+ - type: ap
1057
+ value: 69.96366657730151
1058
+ - type: f1
1059
+ value: 84.92349905611292
1060
  ---
1061
+
1062
+ ## stella model
1063
+
1064
+ **新闻 | News**
1065
+
1066
+ **[2023-10-12]** 开源stella-base-zh-v2和stella-large-zh-v2, 效果更好且使用简单,**不需要任何前缀文本**。
1067
+ Released stella-base-zh-v2 and stella-large-zh-v2. Both models deliver better performance
1068
+ and **do not need any prefix text**.\
1069
+ **[2023-09-11]** 开源stella-base-zh和stella-large-zh
1070
+
1071
+ stella是一个通用的文本编码模型,主要有以下模型:
1072
+
1073
+ | Model Name | Model Size (GB) | Dimension | Sequence Length | Language | Need instruction for retrieval? |
1074
+ |:------------------:|:---------------:|:---------:|:---------------:|:--------:|:-------------------------------:|
1075
+ | stella-large-zh-v2 | 0.65 | 1024 | 1024 | Chinese | No |
1076
+ | stella-base-zh-v2 | 0.2 | 768 | 1024 | Chinese | No |
1077
+ | stella-large-zh | 0.65 | 1024 | 1024 | Chinese | Yes |
1078
+ | stella-base-zh | 0.2 | 768 | 1024 | Chinese | Yes |
1079
+
1080
+ 完整的训练思路和训练过程已记录在[博客](https://zhuanlan.zhihu.com/p/655322183),欢迎阅读讨论。
1081
+
1082
+ **训练数据:**
1083
+
1084
+ 1. 开源数据(wudao_base_200GB[1]、m3e[2]和simclue[3]),着重挑选了长度大于512的文本
1085
+ 2. 在通用语料库上使用LLM构造一批(question, paragraph)和(sentence, paragraph)数据
1086
+
1087
+ **训练方法:**
1088
+
1089
+ 1. 对比学习损失函数
1090
+ 2. 带有难负例的对比学习损失函数(分别基于bm25和vector构造了难负例)
1091
+ 3. EWC(Elastic Weight Consolidation)[4]
1092
+ 4. cosent loss[5]
1093
+ 5. 每一种类型的数据一个迭代器,分别计算loss进行更新
1094
+
1095
+ stella-v2在stella模型的基础上,使用了更多的训练数据,同时使用知识蒸馏等方法去除了前置的instruction(
1096
+ 比如piccolo的`查询:`, `结果:`, e5的`query:`和`passage:`)。
1097
+
1098
+ **初始权重:**\
1099
+ stella-base-zh和stella-large-zh分别以piccolo-base-zh[6]和piccolo-large-zh作为基础模型,512-1024的position
1100
+ embedding使用层次分解位置编码[7]进行初始化。\
1101
+ 感谢商汤科技研究院开源的[piccolo系列模型](https://huggingface.co/sensenova)。
1102
+
1103
+ stella is a general-purpose text encoder, which mainly includes the following models:
1104
+
1105
+ | Model Name | Model Size (GB) | Dimension | Sequence Length | Language | Need instruction for retrieval? |
1106
+ |:------------------:|:---------------:|:---------:|:---------------:|:--------:|:-------------------------------:|
1107
+ | stella-large-zh-v2 | 0.65 | 1024 | 1024 | Chinese | No |
1108
+ | stella-base-zh-v2 | 0.2 | 768 | 1024 | Chinese | No |
1109
+ | stella-large-zh | 0.65 | 1024 | 1024 | Chinese | Yes |
1110
+ | stella-base-zh | 0.2 | 768 | 1024 | Chinese | Yes |
1111
+
1112
+ The training data mainly includes:
1113
+
1114
+ 1. Open-source training data (wudao_base_200GB, m3e, and simclue), with a focus on selecting texts with lengths greater
1115
+ than 512.
1116
+ 2. A batch of (question, paragraph) and (sentence, paragraph) data constructed on a general corpus using LLM.
1117
+
1118
+ The loss functions mainly include:
1119
+
1120
+ 1. Contrastive learning loss function
1121
+ 2. Contrastive learning loss function with hard negative examples (hard negatives mined separately with BM25 and with vector retrieval)
1122
+ 3. EWC (Elastic Weight Consolidation)
1123
+ 4. cosent loss
1124
+
1125
+ Model weight initialization:\
1126
+ stella-base-zh and stella-large-zh use piccolo-base-zh and piccolo-large-zh as the base models, respectively, and the
1127
+ 512-1024 position embedding uses the initialization strategy of hierarchical decomposed position encoding.
1128
+
1129
+ Training strategy:\
1130
+ One iterator for each type of data, separately calculating the loss.
1131
+
1132
+ Building on the stella models, stella-v2 uses more training data and removes the need for instruction prefixes by means of knowledge distillation.
1133
+
1134
+ ## Metric
1135
+
1136
+ #### C-MTEB leaderboard (Chinese)
1137
+
1138
+ | Model Name | Model Size (GB) | Dimension | Sequence Length | Average (35) | Classification (9) | Clustering (4) | Pair Classification (2) | Reranking (4) | Retrieval (8) | STS (8) |
1139
+ |:------------------:|:---------------:|:---------:|:---------------:|:------------:|:------------------:|:--------------:|:-----------------------:|:-------------:|:-------------:|:-------:|
1140
+ | stella-large-zh-v2 | 0.65 | 1024 | 1024 | 65.13 | 69.05 | 49.16 | 82.68 | 66.41 | 70.14 | 58.66 |
1141
+ | stella-base-zh-v2 | 0.2 | 768 | 1024 | 64.36 | 68.29 | 49.4 | 79.95 | 66.1 | 70.08 | 56.92 |
1142
+ | stella-large-zh | 0.65 | 1024 | 1024 | 64.54 | 67.62 | 48.65 | 78.72 | 65.98 | 71.02 | 58.3 |
1143
+ | stella-base-zh | 0.2 | 768 | 1024 | 64.16 | 67.77 | 48.7 | 76.09 | 66.95 | 71.07 | 56.54 |
1144
+
1145
+ #### Reproduce our results
1146
+
1147
+ Code:
1148
+
1149
+ ```python
1150
+ import torch
1151
+ import numpy as np
1152
+ from typing import List
1153
+ from mteb import MTEB
1154
+ from sentence_transformers import SentenceTransformer
1155
+
1156
+
1157
+ class FastTextEncoder():
1158
+ def __init__(self, model_name):
1159
+ self.model = SentenceTransformer(model_name).cuda().half().eval()
1160
+ self.model.max_seq_length = 512
1161
+
1162
+ def encode(
1163
+ self,
1164
+ input_texts: List[str],
1165
+ *args,
1166
+ **kwargs
1167
+ ):
1168
+ new_sens = list(set(input_texts))
1169
+ new_sens.sort(key=lambda x: len(x), reverse=True)
1170
+ vecs = self.model.encode(
1171
+ new_sens, normalize_embeddings=True, convert_to_numpy=True, batch_size=256
1172
+ ).astype(np.float32)
1173
+ sen2arrid = {sen: idx for idx, sen in enumerate(new_sens)}
1174
+ vecs = vecs[[sen2arrid[sen] for sen in input_texts]]
1175
+ torch.cuda.empty_cache()
1176
+ return vecs
1177
+
1178
+
1179
+ if __name__ == '__main__':
1180
+ model_name = "infgrad/stella-base-zh-v2"
1181
+ output_folder = "zh_mteb_results/stella-base-zh-v2"
1182
+ task_names = [t.description["name"] for t in MTEB(task_langs=['zh', 'zh-CN']).tasks]
1183
+ model = FastTextEncoder(model_name)
1184
+ for task in task_names:
1185
+ MTEB(tasks=[task], task_langs=['zh', 'zh-CN']).run(model, output_folder=output_folder)
1186
+
1187
+ ```
1188
+
1189
+ #### Evaluation for long text
1190
+
1191
+ 经过实际观察发现,C-MTEB的评测数据长度基本都是小于512的,
1192
+ 更致命的是那些长度大于512的文本,其重点都在前半部分。
1193
+ 这里以CMRC2018的数据为例说明这个问题:
1194
+
1195
+ ```
1196
+ question: 《无双大蛇z》是谁旗下ω-force开发的动作游戏?
1197
+
1198
+ passage:《无双大蛇z》是光荣旗下ω-force开发的动作游戏,于2009年3月12日登陆索尼playstation3,并于2009年11月27日推......
1199
+ ```
1200
+
1201
+ passage长度为800多,大于512,但是对于这个question而言只需要前面40个字就足以检索,多的内容对于模型而言是一种噪声,反而降低了效果。\
1202
+ 简言之,现有数据集的2个问题:\
1203
+ 1)长度大于512的过少\
1204
+ 2)即便大于512,对于检索而言也只需要前512的文本内容\
1205
+ 导致**无法准确评估模型的长文本编码能力。**
1206
+
1207
+ 为了解决这个问题,搜集了相关开源数据并使用规则进行过滤,最终整理了6份长文本测试集,它们分别是:
1208
+
1209
+ - CMRC2018,通用百科
1210
+ - CAIL,法律阅读理解
1211
+ - DRCD,繁体百科,已转简体
1212
+ - Military,军工问答
1213
+ - Squad,英文阅读理解,已转中文
1214
+ - Multifieldqa_zh,清华的大模型长文本理解能力评测数据[9]
1215
+
1216
+ 处理规则是选取答案在512长度之后的文本,短的测试数据会欠采样一下,长短文本占比约为1:2,所以模型既得理解短文本也得理解长文本。
1217
+ 除了Military数据集,我们提供了其他5个测试数据的下载地址:https://drive.google.com/file/d/1WC6EWaCbVgz-vPMDFH4TwAMkLyh5WNcN/view?usp=sharing
1218
+
1219
+ 评测指标为Recall@5, 结果如下:
1220
+
1221
+ | Dataset | piccolo-base-zh | piccolo-large-zh | bge-base-zh | bge-large-zh | stella-base-zh | stella-large-zh |
1222
+ |:---------------:|:---------------:|:----------------:|:-----------:|:------------:|:--------------:|:---------------:|
1223
+ | CMRC2018 | 94.34 | 93.82 | 91.56 | 93.12 | 96.08 | 95.56 |
1224
+ | CAIL | 28.04 | 33.64 | 31.22 | 33.94 | 34.62 | 37.18 |
1225
+ | DRCD | 78.25 | 77.9 | 78.34 | 80.26 | 86.14 | 84.58 |
1226
+ | Military | 76.61 | 73.06 | 75.65 | 75.81 | 83.71 | 80.48 |
1227
+ | Squad | 91.21 | 86.61 | 87.87 | 90.38 | 93.31 | 91.21 |
1228
+ | Multifieldqa_zh | 81.41 | 83.92 | 83.92 | 83.42 | 79.9 | 80.4 |
1229
+ | **Average** | 74.98 | 74.83 | 74.76 | 76.15 | **78.96** | **78.24** |
1230
+
1231
+ **注意:** 因为长文本评测数据数量稀少,所以构造时也使用了train部分,如果自行评测,请注意模型的训练数据以免数据泄露。
1232
+
1233
+ ## Usage
1234
+
1235
+ #### stella 中文系列模型
1236
+
1237
+ stella-base-zh 和 stella-large-zh: 本模型是在piccolo基础上训练的,因此**用法和piccolo完全一致**
1238
+ ,即在检索重排任务上给query和passage分别加上`查询: `和`结果: `。对于短文本与短文本之间的匹配(短对短)不需要做任何操作。
1239
+
1240
+ stella-base-zh-v2 和 stella-large-zh-v2: 本模型使用简单,**任何使用场景中都不需要加前缀文本**。
1241
+
1242
+ stella中文系列模型均使用mean pooling的结果作为文本向量。
1243
+
1244
+ 在sentence-transformer库中的使用方法:
1245
+
1246
+ ```python
1247
+ # 对于短对短数据集,下面是通用的使用方式
1248
+ from sentence_transformers import SentenceTransformer
1249
+
1250
+ sentences = ["数据1", "数据2"]
1251
+ model = SentenceTransformer('infgrad/stella-base-zh-v2')
1252
+ print(model.max_seq_length)
1253
+ embeddings_1 = model.encode(sentences, normalize_embeddings=True)
1254
+ embeddings_2 = model.encode(sentences, normalize_embeddings=True)
1255
+ similarity = embeddings_1 @ embeddings_2.T
1256
+ print(similarity)
1257
+ ```
1258
+
1259
+ 直接使用transformers库:
1260
+
1261
+ ```python
1262
+ from transformers import AutoModel, AutoTokenizer
1263
+ from sklearn.preprocessing import normalize
1264
+
1265
+ model = AutoModel.from_pretrained('infgrad/stella-base-zh-v2')
1266
+ tokenizer = AutoTokenizer.from_pretrained('infgrad/stella-base-zh-v2')
1267
+ sentences = ["数据1", "数据ABCDEFGH"]
1268
+ batch_data = tokenizer(
1269
+ batch_text_or_text_pairs=sentences,
1270
+ padding="longest",
1271
+ return_tensors="pt",
1272
+ max_length=1024,
1273
+ truncation=True,
1274
+ )
1275
+ attention_mask = batch_data["attention_mask"]
1276
+ model_output = model(**batch_data)
1277
+ last_hidden = model_output.last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
1278
+ vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
1279
+ vectors = normalize(vectors.detach().cpu().numpy(), norm="l2", axis=1, )
1280
+ print(vectors.shape) # 2,768
1281
+ ```
1282
+
1283
+ #### stella models for English
1284
+
1285
+ developing...
1286
+
1287
+ ## Training Detail
1288
+
1289
+ **硬件:** 单卡A100-80GB
1290
+
1291
+ **环境:** torch1.13.*; transformers-trainer + deepspeed + gradient-checkpointing
1292
+
1293
+ **学习率:** 1e-6
1294
+
1295
+ **batch_size:** base模型为1024,额外增加20%的难负例;large模型为768,额外增加20%的难负例
1296
+
1297
+ **数据量:** 第一版模型约100万,其中用LLM构造的数据约有200K,LLM模型大小为13b。v2系列模型的训练数据量达到了2000万。
1298
+
1299
+ ## ToDoList
1300
+
1301
+ **评测的稳定性:**
1302
+ 评测过程中发现Clustering任务会和官方的结果不一致,大约有±0.0x的小差距,原因是聚类代码没有设置random_seed,差距可以忽略不计,不影响评测结论。
1303
+
1304
+ **更高质量的长文本训练和测试数据:** 训练数据多是用13b模型构造的,肯定会存在噪声。
1305
+ 测试数据基本都是从mrc数据整理来的,所以问题都是factoid类型,不符合真实分布。
1306
+
1307
+ **OOD的性能:** 虽然近期出现了很多向量编码模型,但是对于不是那么通用的domain,这一众模型包括stella、openai和cohere,
1308
+ 它们的效果均比不上BM25。
1309
+
1310
+ ## Reference
1311
+
1312
+ 1. https://www.scidb.cn/en/detail?dataSetId=c6a3fe684227415a9db8e21bac4a15ab
1313
+ 2. https://github.com/wangyuxinwhy/uniem
1314
+ 3. https://github.com/CLUEbenchmark/SimCLUE
1315
+ 4. https://arxiv.org/abs/1612.00796
1316
+ 5. https://kexue.fm/archives/8847
1317
+ 6. https://huggingface.co/sensenova/piccolo-base-zh
1318
+ 7. https://kexue.fm/archives/7947
1319
+ 8. https://github.com/FlagOpen/FlagEmbedding
1320
+ 9. https://github.com/THUDM/LongBench
1321
+
1322
+
config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "directionality": "bidi",
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-12,
16
+ "max_position_embeddings": 1024,
17
+ "model_type": "bert",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "output_past": true,
21
+ "pad_token_id": 0,
22
+ "pooler_fc_size": 768,
23
+ "pooler_num_attention_heads": 12,
24
+ "pooler_num_fc_layers": 3,
25
+ "pooler_size_per_head": 128,
26
+ "pooler_type": "first_token_transform",
27
+ "position_embedding_type": "absolute",
28
+ "torch_dtype": "float16",
29
+ "transformers_version": "4.30.2",
30
+ "type_vocab_size": 2,
31
+ "uniem_pooling_strategy": "last_mean",
32
+ "use_cache": true,
33
+ "vocab_size": 21128
34
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdcd3c81b1712c88199abdd259c995ce1e088457d4472c4041b7bf60badcc18c
3
+ size 205397037
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "mask_token": "[MASK]",
6
+ "model_max_length": 1024,
7
+ "pad_token": "[PAD]",
8
+ "sep_token": "[SEP]",
9
+ "strip_accents": null,
10
+ "tokenize_chinese_chars": true,
11
+ "tokenizer_class": "BertTokenizer",
12
+ "unk_token": "[UNK]"
13
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff