infgrad committed on
Commit
a6d4ffa
1 Parent(s): 089fa8b

Upload 7 files


upload base model

Files changed (7)
  1. README.md +1246 -1
  2. config.json +35 -0
  3. pytorch_model.bin +3 -0
  4. special_tokens_map.json +7 -0
  5. tokenizer.json +0 -0
  6. tokenizer_config.json +13 -0
  7. vocab.txt +0 -0
README.md CHANGED
@@ -1,3 +1,1248 @@
- license: mit
---
tags:
- mteb
model-index:
- name: stella-base-zh
  results:
  - task:
      type: STS
    dataset:
      type: C-MTEB/AFQMC
      name: MTEB AFQMC
      config: default
      split: validation
      revision: None
    metrics:
    - type: cos_sim_pearson
      value: 49.34825050234731
    - type: cos_sim_spearman
      value: 51.74726338428475
    - type: euclidean_pearson
      value: 50.14955499038012
    - type: euclidean_spearman
      value: 51.74730359287025
    - type: manhattan_pearson
      value: 50.016703594410615
    - type: manhattan_spearman
      value: 51.63936364317057
  - task:
      type: STS
    dataset:
      type: C-MTEB/ATEC
      name: MTEB ATEC
      config: default
      split: test
      revision: None
    metrics:
    - type: cos_sim_pearson
      value: 52.26876163587667
    - type: cos_sim_spearman
      value: 52.818410137444374
    - type: euclidean_pearson
      value: 55.24925286208574
    - type: euclidean_spearman
      value: 52.818404507964686
    - type: manhattan_pearson
      value: 55.21236977375391
    - type: manhattan_spearman
      value: 52.80289117015117
  - task:
      type: Classification
    dataset:
      type: mteb/amazon_reviews_multi
      name: MTEB AmazonReviewsClassification (zh)
      config: zh
      split: test
      revision: 1399c76144fd37290681b995c656ef9b2e06e26d
    metrics:
    - type: accuracy
      value: 40.245999999999995
    - type: f1
      value: 38.55443674287747
  - task:
      type: STS
    dataset:
      type: C-MTEB/BQ
      name: MTEB BQ
      config: default
      split: test
      revision: None
    metrics:
    - type: cos_sim_pearson
      value: 61.553652835163255
    - type: cos_sim_spearman
      value: 63.29065064027392
    - type: euclidean_pearson
      value: 62.000329557485
    - type: euclidean_spearman
      value: 63.290650638944825
    - type: manhattan_pearson
      value: 62.02786936153664
    - type: manhattan_spearman
      value: 63.32720383880146
  - task:
      type: Clustering
    dataset:
      type: C-MTEB/CLSClusteringP2P
      name: MTEB CLSClusteringP2P
      config: default
      split: test
      revision: None
    metrics:
    - type: v_measure
      value: 39.71224230526474
  - task:
      type: Clustering
    dataset:
      type: C-MTEB/CLSClusteringS2S
      name: MTEB CLSClusteringS2S
      config: default
      split: test
      revision: None
    metrics:
    - type: v_measure
      value: 36.55705201882987
  - task:
      type: Reranking
    dataset:
      type: C-MTEB/CMedQAv1-reranking
      name: MTEB CMedQAv1
      config: default
      split: test
      revision: None
    metrics:
    - type: map
      value: 85.69418720521168
    - type: mrr
      value: 87.97444444444446
  - task:
      type: Reranking
    dataset:
      type: C-MTEB/CMedQAv2-reranking
      name: MTEB CMedQAv2
      config: default
      split: test
      revision: None
    metrics:
    - type: map
      value: 86.46348358482606
    - type: mrr
      value: 88.81428571428572
  - task:
      type: Retrieval
    dataset:
      type: C-MTEB/CmedqaRetrieval
      name: MTEB CmedqaRetrieval
      config: default
      split: dev
      revision: None
    metrics:
    - type: map_at_1
      value: 23.721
    - type: map_at_10
      value: 35.428
    - type: map_at_100
      value: 37.438
    - type: map_at_1000
      value: 37.557
    - type: map_at_3
      value: 31.589
    - type: map_at_5
      value: 33.647
    - type: mrr_at_1
      value: 36.709
    - type: mrr_at_10
      value: 44.590999999999994
    - type: mrr_at_100
      value: 45.684999999999995
    - type: mrr_at_1000
      value: 45.732
    - type: mrr_at_3
      value: 42.331
    - type: mrr_at_5
      value: 43.532
    - type: ndcg_at_1
      value: 36.709
    - type: ndcg_at_10
      value: 41.858000000000004
    - type: ndcg_at_100
      value: 49.775999999999996
    - type: ndcg_at_1000
      value: 51.844
    - type: ndcg_at_3
      value: 37.067
    - type: ndcg_at_5
      value: 38.875
    - type: precision_at_1
      value: 36.709
    - type: precision_at_10
      value: 9.411999999999999
    - type: precision_at_100
      value: 1.5709999999999997
    - type: precision_at_1000
      value: 0.183
    - type: precision_at_3
      value: 21.154999999999998
    - type: precision_at_5
      value: 15.184000000000001
    - type: recall_at_1
      value: 23.721
    - type: recall_at_10
      value: 51.714000000000006
    - type: recall_at_100
      value: 84.60600000000001
    - type: recall_at_1000
      value: 98.414
    - type: recall_at_3
      value: 37.091
    - type: recall_at_5
      value: 42.978
  - task:
      type: PairClassification
    dataset:
      type: C-MTEB/CMNLI
      name: MTEB Cmnli
      config: default
      split: validation
      revision: None
    metrics:
    - type: cos_sim_accuracy
      value: 73.61395069152135
    - type: cos_sim_ap
      value: 81.65459344597652
    - type: cos_sim_f1
      value: 75.66718995290425
    - type: cos_sim_precision
      value: 68.4918529746116
    - type: cos_sim_recall
      value: 84.5218611176058
    - type: dot_accuracy
      value: 73.61395069152135
    - type: dot_ap
      value: 81.64596407363373
    - type: dot_f1
      value: 75.66718995290425
    - type: dot_precision
      value: 68.4918529746116
    - type: dot_recall
      value: 84.5218611176058
    - type: euclidean_accuracy
      value: 73.61395069152135
    - type: euclidean_ap
      value: 81.6546013070452
    - type: euclidean_f1
      value: 75.66718995290425
    - type: euclidean_precision
      value: 68.4918529746116
    - type: euclidean_recall
      value: 84.5218611176058
    - type: manhattan_accuracy
      value: 73.51773902585688
    - type: manhattan_ap
      value: 81.57345451483191
    - type: manhattan_f1
      value: 75.7393958530681
    - type: manhattan_precision
      value: 68.87442572741195
    - type: manhattan_recall
      value: 84.12438625204582
    - type: max_accuracy
      value: 73.61395069152135
    - type: max_ap
      value: 81.6546013070452
    - type: max_f1
      value: 75.7393958530681
  - task:
      type: Retrieval
    dataset:
      type: C-MTEB/CovidRetrieval
      name: MTEB CovidRetrieval
      config: default
      split: dev
      revision: None
    metrics:
    - type: map_at_1
      value: 73.551
    - type: map_at_10
      value: 81.513
    - type: map_at_100
      value: 81.734
    - type: map_at_1000
      value: 81.73700000000001
    - type: map_at_3
      value: 80.27300000000001
    - type: map_at_5
      value: 81.017
    - type: mrr_at_1
      value: 73.762
    - type: mrr_at_10
      value: 81.479
    - type: mrr_at_100
      value: 81.699
    - type: mrr_at_1000
      value: 81.702
    - type: mrr_at_3
      value: 80.33
    - type: mrr_at_5
      value: 80.999
    - type: ndcg_at_1
      value: 73.867
    - type: ndcg_at_10
      value: 84.711
    - type: ndcg_at_100
      value: 85.714
    - type: ndcg_at_1000
      value: 85.803
    - type: ndcg_at_3
      value: 82.244
    - type: ndcg_at_5
      value: 83.514
    - type: precision_at_1
      value: 73.867
    - type: precision_at_10
      value: 9.557
    - type: precision_at_100
      value: 1.001
    - type: precision_at_1000
      value: 0.101
    - type: precision_at_3
      value: 29.505
    - type: precision_at_5
      value: 18.377
    - type: recall_at_1
      value: 73.551
    - type: recall_at_10
      value: 94.521
    - type: recall_at_100
      value: 99.05199999999999
    - type: recall_at_1000
      value: 99.789
    - type: recall_at_3
      value: 87.777
    - type: recall_at_5
      value: 90.83200000000001
  - task:
      type: Retrieval
    dataset:
      type: C-MTEB/DuRetrieval
      name: MTEB DuRetrieval
      config: default
      split: dev
      revision: None
    metrics:
    - type: map_at_1
      value: 26.230999999999998
    - type: map_at_10
      value: 80.635
    - type: map_at_100
      value: 83.393
    - type: map_at_1000
      value: 83.431
    - type: map_at_3
      value: 55.717000000000006
    - type: map_at_5
      value: 70.387
    - type: mrr_at_1
      value: 90.75
    - type: mrr_at_10
      value: 93.569
    - type: mrr_at_100
      value: 93.648
    - type: mrr_at_1000
      value: 93.65
    - type: mrr_at_3
      value: 93.27499999999999
    - type: mrr_at_5
      value: 93.482
    - type: ndcg_at_1
      value: 90.75
    - type: ndcg_at_10
      value: 87.801
    - type: ndcg_at_100
      value: 90.44
    - type: ndcg_at_1000
      value: 90.776
    - type: ndcg_at_3
      value: 86.556
    - type: ndcg_at_5
      value: 85.468
    - type: precision_at_1
      value: 90.75
    - type: precision_at_10
      value: 42.08
    - type: precision_at_100
      value: 4.816
    - type: precision_at_1000
      value: 0.49
    - type: precision_at_3
      value: 77.60000000000001
    - type: precision_at_5
      value: 65.49000000000001
    - type: recall_at_1
      value: 26.230999999999998
    - type: recall_at_10
      value: 89.00200000000001
    - type: recall_at_100
      value: 97.866
    - type: recall_at_1000
      value: 99.569
    - type: recall_at_3
      value: 57.778
    - type: recall_at_5
      value: 74.895
  - task:
      type: Retrieval
    dataset:
      type: C-MTEB/EcomRetrieval
      name: MTEB EcomRetrieval
      config: default
      split: dev
      revision: None
    metrics:
    - type: map_at_1
      value: 47.599999999999994
    - type: map_at_10
      value: 57.296
    - type: map_at_100
      value: 58.011
    - type: map_at_1000
      value: 58.028
    - type: map_at_3
      value: 54.300000000000004
    - type: map_at_5
      value: 56.21000000000001
    - type: mrr_at_1
      value: 47.599999999999994
    - type: mrr_at_10
      value: 57.296
    - type: mrr_at_100
      value: 58.011
    - type: mrr_at_1000
      value: 58.028
    - type: mrr_at_3
      value: 54.300000000000004
    - type: mrr_at_5
      value: 56.21000000000001
    - type: ndcg_at_1
      value: 47.599999999999994
    - type: ndcg_at_10
      value: 62.458000000000006
    - type: ndcg_at_100
      value: 65.589
    - type: ndcg_at_1000
      value: 66.059
    - type: ndcg_at_3
      value: 56.364000000000004
    - type: ndcg_at_5
      value: 59.815
    - type: precision_at_1
      value: 47.599999999999994
    - type: precision_at_10
      value: 7.89
    - type: precision_at_100
      value: 0.928
    - type: precision_at_1000
      value: 0.097
    - type: precision_at_3
      value: 20.767
    - type: precision_at_5
      value: 14.14
    - type: recall_at_1
      value: 47.599999999999994
    - type: recall_at_10
      value: 78.9
    - type: recall_at_100
      value: 92.80000000000001
    - type: recall_at_1000
      value: 96.6
    - type: recall_at_3
      value: 62.3
    - type: recall_at_5
      value: 70.7
  - task:
      type: Classification
    dataset:
      type: C-MTEB/IFlyTek-classification
      name: MTEB IFlyTek
      config: default
      split: validation
      revision: None
    metrics:
    - type: accuracy
      value: 47.46440938822624
    - type: f1
      value: 34.587004997852524
  - task:
      type: Classification
    dataset:
      type: C-MTEB/JDReview-classification
      name: MTEB JDReview
      config: default
      split: test
      revision: None
    metrics:
    - type: accuracy
      value: 84.9906191369606
    - type: ap
      value: 52.31309789960497
    - type: f1
      value: 79.55556102310072
  - task:
      type: STS
    dataset:
      type: C-MTEB/LCQMC
      name: MTEB LCQMC
      config: default
      split: test
      revision: None
    metrics:
    - type: cos_sim_pearson
      value: 69.80872804636063
    - type: cos_sim_spearman
      value: 75.83290476813391
    - type: euclidean_pearson
      value: 74.09865882324753
    - type: euclidean_spearman
      value: 75.83290698376118
    - type: manhattan_pearson
      value: 74.0616102379577
    - type: manhattan_spearman
      value: 75.81278969865738
  - task:
      type: Retrieval
    dataset:
      type: C-MTEB/MMarcoRetrieval
      name: MTEB MMarcoRetrieval
      config: default
      split: dev
      revision: None
    metrics:
    - type: map_at_1
      value: 65.029
    - type: map_at_10
      value: 74.39
    - type: map_at_100
      value: 74.734
    - type: map_at_1000
      value: 74.74300000000001
    - type: map_at_3
      value: 72.52
    - type: map_at_5
      value: 73.724
    - type: mrr_at_1
      value: 67.192
    - type: mrr_at_10
      value: 74.95100000000001
    - type: mrr_at_100
      value: 75.25500000000001
    - type: mrr_at_1000
      value: 75.263
    - type: mrr_at_3
      value: 73.307
    - type: mrr_at_5
      value: 74.355
    - type: ndcg_at_1
      value: 67.192
    - type: ndcg_at_10
      value: 78.22200000000001
    - type: ndcg_at_100
      value: 79.76299999999999
    - type: ndcg_at_1000
      value: 80.018
    - type: ndcg_at_3
      value: 74.656
    - type: ndcg_at_5
      value: 76.697
    - type: precision_at_1
      value: 67.192
    - type: precision_at_10
      value: 9.513
    - type: precision_at_100
      value: 1.027
    - type: precision_at_1000
      value: 0.105
    - type: precision_at_3
      value: 28.204
    - type: precision_at_5
      value: 18.009
    - type: recall_at_1
      value: 65.029
    - type: recall_at_10
      value: 89.462
    - type: recall_at_100
      value: 96.418
    - type: recall_at_1000
      value: 98.409
    - type: recall_at_3
      value: 80.029
    - type: recall_at_5
      value: 84.882
  - task:
      type: Classification
    dataset:
      type: mteb/amazon_massive_intent
      name: MTEB MassiveIntentClassification (zh-CN)
      config: zh-CN
      split: test
      revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
    metrics:
    - type: accuracy
      value: 65.56489576328177
    - type: f1
      value: 63.37174551232159
  - task:
      type: Classification
    dataset:
      type: mteb/amazon_massive_scenario
      name: MTEB MassiveScenarioClassification (zh-CN)
      config: zh-CN
      split: test
      revision: 7d571f92784cd94a019292a1f45445077d0ef634
    metrics:
    - type: accuracy
      value: 71.4862138533961
    - type: f1
      value: 71.171374964826
  - task:
      type: Retrieval
    dataset:
      type: C-MTEB/MedicalRetrieval
      name: MTEB MedicalRetrieval
      config: default
      split: dev
      revision: None
    metrics:
    - type: map_at_1
      value: 48.6
    - type: map_at_10
      value: 54.92700000000001
    - type: map_at_100
      value: 55.528
    - type: map_at_1000
      value: 55.584
    - type: map_at_3
      value: 53.55
    - type: map_at_5
      value: 54.379999999999995
    - type: mrr_at_1
      value: 48.8
    - type: mrr_at_10
      value: 55.028999999999996
    - type: mrr_at_100
      value: 55.629
    - type: mrr_at_1000
      value: 55.684999999999995
    - type: mrr_at_3
      value: 53.65
    - type: mrr_at_5
      value: 54.48
    - type: ndcg_at_1
      value: 48.6
    - type: ndcg_at_10
      value: 57.965999999999994
    - type: ndcg_at_100
      value: 61.043000000000006
    - type: ndcg_at_1000
      value: 62.624
    - type: ndcg_at_3
      value: 55.132000000000005
    - type: ndcg_at_5
      value: 56.621
    - type: precision_at_1
      value: 48.6
    - type: precision_at_10
      value: 6.75
    - type: precision_at_100
      value: 0.823
    - type: precision_at_1000
      value: 0.095
    - type: precision_at_3
      value: 19.900000000000002
    - type: precision_at_5
      value: 12.659999999999998
    - type: recall_at_1
      value: 48.6
    - type: recall_at_10
      value: 67.5
    - type: recall_at_100
      value: 82.3
    - type: recall_at_1000
      value: 94.89999999999999
    - type: recall_at_3
      value: 59.699999999999996
    - type: recall_at_5
      value: 63.3
  - task:
      type: Reranking
    dataset:
      type: C-MTEB/Mmarco-reranking
      name: MTEB MMarcoReranking
      config: default
      split: dev
      revision: None
    metrics:
    - type: map
      value: 29.196130696027474
    - type: mrr
      value: 28.43730158730159
  - task:
      type: Classification
    dataset:
      type: C-MTEB/MultilingualSentiment-classification
      name: MTEB MultilingualSentiment
      config: default
      split: validation
      revision: None
    metrics:
    - type: accuracy
      value: 72.48333333333333
    - type: f1
      value: 72.00258522357558
  - task:
      type: PairClassification
    dataset:
      type: C-MTEB/OCNLI
      name: MTEB Ocnli
      config: default
      split: validation
      revision: None
    metrics:
    - type: cos_sim_accuracy
      value: 65.13264753654575
    - type: cos_sim_ap
      value: 70.52831936800807
    - type: cos_sim_f1
      value: 71.35353535353535
    - type: cos_sim_precision
      value: 57.787958115183244
    - type: cos_sim_recall
      value: 93.24181626187962
    - type: dot_accuracy
      value: 65.13264753654575
    - type: dot_ap
      value: 70.52828597418102
    - type: dot_f1
      value: 71.35353535353535
    - type: dot_precision
      value: 57.787958115183244
    - type: dot_recall
      value: 93.24181626187962
    - type: euclidean_accuracy
      value: 65.13264753654575
    - type: euclidean_ap
      value: 70.52828597418102
    - type: euclidean_f1
      value: 71.35353535353535
    - type: euclidean_precision
      value: 57.787958115183244
    - type: euclidean_recall
      value: 93.24181626187962
    - type: manhattan_accuracy
      value: 64.8077964266378
    - type: manhattan_ap
      value: 70.39954487476643
    - type: manhattan_f1
      value: 71.2270200940573
    - type: manhattan_precision
      value: 59.84195402298851
    - type: manhattan_recall
      value: 87.96198521647307
    - type: max_accuracy
      value: 65.13264753654575
    - type: max_ap
      value: 70.52831936800807
    - type: max_f1
      value: 71.35353535353535
  - task:
      type: Classification
    dataset:
      type: C-MTEB/OnlineShopping-classification
      name: MTEB OnlineShopping
      config: default
      split: test
      revision: None
    metrics:
    - type: accuracy
      value: 90.34
    - type: ap
      value: 87.79622626876444
    - type: f1
      value: 90.32357430051181
  - task:
      type: STS
    dataset:
      type: C-MTEB/PAWSX
      name: MTEB PAWSX
      config: default
      split: test
      revision: None
    metrics:
    - type: cos_sim_pearson
      value: 27.9175458105215
    - type: cos_sim_spearman
      value: 32.024302491613014
    - type: euclidean_pearson
      value: 33.01780461609846
    - type: euclidean_spearman
      value: 32.024301939183374
    - type: manhattan_pearson
      value: 32.94874897942371
    - type: manhattan_spearman
      value: 31.902283210178012
  - task:
      type: STS
    dataset:
      type: C-MTEB/QBQTC
      name: MTEB QBQTC
      config: default
      split: test
      revision: None
    metrics:
    - type: cos_sim_pearson
      value: 36.288219964332754
    - type: cos_sim_spearman
      value: 36.46838652731507
    - type: euclidean_pearson
      value: 35.11414028811812
    - type: euclidean_spearman
      value: 36.468386523814104
    - type: manhattan_pearson
      value: 35.20922826624027
    - type: manhattan_spearman
      value: 36.55349180906185
  - task:
      type: STS
    dataset:
      type: mteb/sts22-crosslingual-sts
      name: MTEB STS22 (zh)
      config: zh
      split: test
      revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
    metrics:
    - type: cos_sim_pearson
      value: 66.18186265837434
    - type: cos_sim_spearman
      value: 67.52365178443915
    - type: euclidean_pearson
      value: 65.46342439169497
    - type: euclidean_spearman
      value: 67.52365178443915
    - type: manhattan_pearson
      value: 67.3476263677961
    - type: manhattan_spearman
      value: 69.09476240936812
  - task:
      type: STS
    dataset:
      type: C-MTEB/STSB
      name: MTEB STSB
      config: default
      split: test
      revision: None
    metrics:
    - type: cos_sim_pearson
      value: 72.53864906415339
    - type: cos_sim_spearman
      value: 72.63037820118355
    - type: euclidean_pearson
      value: 72.42255276991672
    - type: euclidean_spearman
      value: 72.63037820118355
    - type: manhattan_pearson
      value: 72.36324244766192
    - type: manhattan_spearman
      value: 72.58609772740323
  - task:
      type: Reranking
    dataset:
      type: C-MTEB/T2Reranking
      name: MTEB T2Reranking
      config: default
      split: dev
      revision: None
    metrics:
    - type: map
      value: 66.45708148192449
    - type: mrr
      value: 76.08372693469173
  - task:
      type: Retrieval
    dataset:
      type: C-MTEB/T2Retrieval
      name: MTEB T2Retrieval
      config: default
      split: dev
      revision: None
    metrics:
    - type: map_at_1
      value: 26.436999999999998
    - type: map_at_10
      value: 74.516
    - type: map_at_100
      value: 78.29899999999999
    - type: map_at_1000
      value: 78.372
    - type: map_at_3
      value: 52.217
    - type: map_at_5
      value: 64.24
    - type: mrr_at_1
      value: 88.23
    - type: mrr_at_10
      value: 91.06400000000001
    - type: mrr_at_100
      value: 91.18
    - type: mrr_at_1000
      value: 91.184
    - type: mrr_at_3
      value: 90.582
    - type: mrr_at_5
      value: 90.88300000000001
    - type: ndcg_at_1
      value: 88.23
    - type: ndcg_at_10
      value: 82.511
    - type: ndcg_at_100
      value: 86.531
    - type: ndcg_at_1000
      value: 87.244
    - type: ndcg_at_3
      value: 83.987
    - type: ndcg_at_5
      value: 82.46900000000001
    - type: precision_at_1
      value: 88.23
    - type: precision_at_10
      value: 41.245
    - type: precision_at_100
      value: 4.987
    - type: precision_at_1000
      value: 0.515
    - type: precision_at_3
      value: 73.675
    - type: precision_at_5
      value: 61.71
    - type: recall_at_1
      value: 26.436999999999998
    - type: recall_at_10
      value: 81.547
    - type: recall_at_100
      value: 94.548
    - type: recall_at_1000
      value: 98.197
    - type: recall_at_3
      value: 54.056000000000004
    - type: recall_at_5
      value: 67.93
  - task:
      type: Classification
    dataset:
      type: C-MTEB/TNews-classification
      name: MTEB TNews
      config: default
      split: validation
      revision: None
    metrics:
    - type: accuracy
      value: 50.784
    - type: f1
      value: 48.89471168071432
  - task:
      type: Clustering
    dataset:
      type: C-MTEB/ThuNewsClusteringP2P
      name: MTEB ThuNewsClusteringP2P
      config: default
      split: test
      revision: None
    metrics:
    - type: v_measure
      value: 63.19039347990962
  - task:
      type: Clustering
    dataset:
      type: C-MTEB/ThuNewsClusteringS2S
      name: MTEB ThuNewsClusteringS2S
      config: default
      split: test
      revision: None
    metrics:
    - type: v_measure
      value: 55.357378578603225
  - task:
      type: Retrieval
    dataset:
      type: C-MTEB/VideoRetrieval
      name: MTEB VideoRetrieval
      config: default
      split: dev
      revision: None
    metrics:
    - type: map_at_1
      value: 58.8
    - type: map_at_10
      value: 68.623
    - type: map_at_100
      value: 69.074
    - type: map_at_1000
      value: 69.085
    - type: map_at_3
      value: 66.767
    - type: map_at_5
      value: 67.972
    - type: mrr_at_1
      value: 58.699999999999996
    - type: mrr_at_10
      value: 68.573
    - type: mrr_at_100
      value: 69.024
    - type: mrr_at_1000
      value: 69.035
    - type: mrr_at_3
      value: 66.717
    - type: mrr_at_5
      value: 67.92200000000001
    - type: ndcg_at_1
      value: 58.8
    - type: ndcg_at_10
      value: 73.038
    - type: ndcg_at_100
      value: 75.16199999999999
    - type: ndcg_at_1000
      value: 75.422
    - type: ndcg_at_3
      value: 69.297
    - type: ndcg_at_5
      value: 71.475
    - type: precision_at_1
      value: 58.8
    - type: precision_at_10
      value: 8.67
    - type: precision_at_100
      value: 0.9650000000000001
    - type: precision_at_1000
      value: 0.099
    - type: precision_at_3
      value: 25.533
    - type: precision_at_5
      value: 16.38
    - type: recall_at_1
      value: 58.8
    - type: recall_at_10
      value: 86.7
    - type: recall_at_100
      value: 96.5
    - type: recall_at_1000
      value: 98.5
    - type: recall_at_3
      value: 76.6
    - type: recall_at_5
      value: 81.89999999999999
  - task:
      type: Classification
    dataset:
      type: C-MTEB/waimai-classification
      name: MTEB Waimai
      config: default
      split: test
      revision: None
    metrics:
    - type: accuracy
      value: 86.61999999999999
    - type: ap
      value: 69.93149123197975
    - type: f1
      value: 84.99670691559903
---

## stella model

stella is a general-purpose Chinese text-embedding model. It currently comes in two versions, base and large, and **both versions support an input length of 1024**.

The full training approach and process are documented in a [blog post](https://zhuanlan.zhihu.com/p/655322183); you are welcome to read and discuss it.

**Training data:**

1. Open-source data (wudao_base_200GB[1], m3e[2] and simclue[3]), with an emphasis on texts longer than 512
2. A batch of (question, paragraph) and (sentence, paragraph) pairs constructed on a general corpus with an LLM

**Training methods** (illustrative sketches of items 3 and 4 follow this list):

1. Contrastive-learning loss
2. Contrastive-learning loss with hard negatives (mined with both bm25 and vector retrieval)
3. EWC (Elastic Weight Consolidation)[4]
4. CoSENT loss[5]
5. One iterator per data type, with the loss computed and applied separately for each

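For concreteness, here are minimal sketches of the two regularizers above, written from the cited references rather than from this model's training code; the function names, tensor shapes, and the `scale`/`lam` defaults are our assumptions:

```python
import torch

def cosent_loss(cos_pos: torch.Tensor, cos_neg: torch.Tensor, scale: float = 20.0) -> torch.Tensor:
    """CoSENT loss [5]: push every positive-pair cosine above every negative-pair cosine.

    cos_pos: cosine similarities of positive pairs, shape (P,)
    cos_neg: cosine similarities of negative pairs, shape (N,)
    """
    # log(1 + sum_{i,j} exp(scale * (cos_neg[j] - cos_pos[i]))), written as one
    # logsumexp over all pairwise margins plus an implicit 0 term for the leading "1".
    diff = scale * (cos_neg[None, :] - cos_pos[:, None])  # (P, N) margins
    return torch.logsumexp(torch.cat([diff.new_zeros(1), diff.flatten()]), dim=0)

def ewc_penalty(model: torch.nn.Module, fisher: dict, anchor: dict, lam: float = 1.0) -> torch.Tensor:
    """EWC [4]: penalize moving parameters that carried high Fisher information
    on the earlier task; `anchor` holds the old parameter values (assumed dicts
    keyed by parameter name)."""
    penalty = sum((fisher[n] * (p - anchor[n]) ** 2).sum()
                  for n, p in model.named_parameters())
    return 0.5 * lam * penalty
```
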
**Initial weights:**\
stella-base-zh and stella-large-zh are initialized from piccolo-base-zh[6] and piccolo-large-zh respectively, and the position embeddings for positions 512-1024 are initialized with hierarchical position decomposition[7]; a sketch of this initialization follows.\
Thanks to SenseTime Research for open-sourcing the [piccolo models](https://huggingface.co/sensenova).
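
A minimal sketch of that initialization, assuming the first 512 trained positions are kept as-is and α = 0.4 (both are our reading of [7], not confirmed details of this model):

```python
import torch

def extend_position_embeddings(pos_emb: torch.Tensor, new_len: int = 1024,
                               alpha: float = 0.4) -> torch.Tensor:
    """Extend trained position embeddings of shape (old_len, dim) to new_len.

    Position m >= old_len is decomposed as m = i * old_len + j and initialized
    as alpha * u_i + (1 - alpha) * u_j, where u_k are the trained embeddings.
    """
    old_len, dim = pos_emb.shape
    out = pos_emb.new_empty(new_len, dim)
    out[:old_len] = pos_emb  # keep the trained positions unchanged
    for m in range(old_len, new_len):
        i, j = divmod(m, old_len)
        out[m] = alpha * pos_emb[i] + (1 - alpha) * pos_emb[j]
    return out
```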

## Metric

#### C-MTEB leaderboard

Results of the stella models on C-MTEB[8]; see the blog post for the evaluation script.

| Model Name | Model Size (GB) | Dimension | Sequence Length | Average (35) | Classification (9) | Clustering (4) | Pair Classification (2) | Reranking (4) | Retrieval (8) | STS (8) |
|:------------------------:|:---------------:|:---------:|:---------------:|:------------:|:------------------:|:--------------:|:-----------------------:|:-------------:|:-------------:|:-------:|
| **stella-large-zh** | 0.65 | 1024 | **1024** | **64.54** | 67.62 | 48.65 | 78.72 | 65.98 | 71.02 | 58.3 |
| **stella-base-zh** | 0.2 | 768 | **1024** | **64.16** | 67.77 | 48.7 | 76.09 | 66.95 | 71.07 | 56.54 |
| piccolo-large-zh | 0.65 | 1024 | 512 | 64.11 | 67.03 | 47.04 | 78.38 | 65.98 | 70.93 | 58.02 |
| bge-large-zh | 1.3 | 1024 | 512 | 63.96 | 68.32 | 48.39 | 78.94 | 65.11 | 71.52 | 54.98 |
| piccolo-base-zh | 0.2 | 768 | 512 | 63.66 | 66.98 | 47.12 | 76.61 | 66.68 | 71.2 | 55.9 |
| bge-large-zh-no-instruct | 1.3 | 1024 | 512 | 63.4 | 68.58 | 50.01 | 76.77 | 64.9 | 70.54 | 53 |
| bge-base-zh | 0.41 | 768 | 512 | 62.8 | 67.07 | 47.64 | 77.5 | 64.91 | 69.53 | 54.12 |

#### Evaluation for long text

In practice we found that almost all of the C-MTEB evaluation texts are shorter than 512 and, worse, that for the texts longer than 512 the key information sits in the first half. The CMRC2018 data illustrates the problem:

```
question: 《无双大蛇z》是谁旗下ω-force开发的动作游戏?

passage:《无双大蛇z》是光荣旗下ω-force开发的动作游戏,于2009年3月12日登陆索尼playstation3,并于2009年11月27日推......
```

The passage is over 800 characters, i.e. longer than 512, but for this question the first 40 characters are already enough for retrieval; the rest of the content is noise to the model and actually hurts performance.\
In short, existing datasets have two problems:\
1) too few texts are longer than 512, and\
2) even for those that are, the first 512 characters are all that retrieval needs,\
so they **cannot accurately measure a model's long-text encoding ability.**

To address this, we collected relevant open-source data, filtered it with rules, and assembled six long-text test sets:

- CMRC2018: general encyclopedia
- CAIL: legal reading comprehension
- DRCD: Traditional-Chinese encyclopedia, converted to Simplified
- Military: military Q&A
- Squad: English reading comprehension, translated into Chinese
- Multifieldqa_zh: Tsinghua's long-text comprehension benchmark for LLMs[9]

The selection rule keeps texts whose answer appears after position 512; shorter test data is down-sampled so that the short-to-long ratio is roughly 1:2, so a model has to handle both short and long texts.
Except for the Military set, the other five test sets can be downloaded here: https://drive.google.com/file/d/1WC6EWaCbVgz-vPMDFH4TwAMkLyh5WNcN/view?usp=sharing

The evaluation metric is Recall@5; the results are as follows:

| Dataset | piccolo-base-zh | piccolo-large-zh | bge-base-zh | bge-large-zh | stella-base-zh | stella-large-zh |
|:---------------:|:---------------:|:----------------:|:-----------:|:------------:|:--------------:|:---------------:|
| CMRC2018 | 94.34 | 93.82 | 91.56 | 93.12 | 96.08 | 95.56 |
| CAIL | 28.04 | 33.64 | 31.22 | 33.94 | 34.62 | 37.18 |
| DRCD | 78.25 | 77.9 | 78.34 | 80.26 | 86.14 | 84.58 |
| Military | 76.61 | 73.06 | 75.65 | 75.81 | 83.71 | 80.48 |
| Squad | 91.21 | 86.61 | 87.87 | 90.38 | 93.31 | 91.21 |
| Multifieldqa_zh | 81.41 | 83.92 | 83.92 | 83.42 | 79.9 | 80.4 |
| **Average** | 74.98 | 74.83 | 74.76 | 76.15 | **78.96** | **78.24** |

**Note:** Because long-text evaluation data is scarce, the train splits were also used when constructing these test sets; if you run this evaluation yourself, check your model's training data to avoid leakage.

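For reference, Recall@5 as used here can be computed as below; this is our own minimal sketch (the array names and the one-gold-passage-per-query setup are assumptions):

```python
import numpy as np

def recall_at_k(query_vecs: np.ndarray, passage_vecs: np.ndarray,
                gold: list, k: int = 5) -> float:
    """Fraction of queries whose gold passage appears in the top-k results.

    Vectors are assumed L2-normalized, so the dot product equals cosine
    similarity; gold[i] is the index of the correct passage for query i.
    """
    scores = query_vecs @ passage_vecs.T       # (num_queries, num_passages)
    topk = np.argsort(-scores, axis=1)[:, :k]  # indices of the k best passages
    return float(np.mean([g in row for g, row in zip(gold, topk)]))
```
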
## Usage

This model was trained on top of piccolo, so **its usage is exactly the same as piccolo's**.\
**Note:** the colons in stella's instructions are ASCII colons, i.e. `查询: ` and `结果: `.

Usage with the sentence-transformers library:

```python
# For short-to-short datasets, this is the generic usage.
from sentence_transformers import SentenceTransformer

sentences_1 = ["数据1", "数据2"]
sentences_2 = ["数据3", "数据4"]
model = SentenceTransformer('infgrad/stella-base-zh')
print(model.max_seq_length)
embeddings_1 = model.encode(sentences_1, normalize_embeddings=True)
embeddings_2 = model.encode(sentences_2, normalize_embeddings=True)
# With normalized embeddings, the dot product is cosine similarity.
similarity = embeddings_1 @ embeddings_2.T
print(similarity)
# For short-to-long (retrieval) datasets, adding the instructions is recommended
# to help the model retrieve better; note the ASCII colon in the instruction.
```
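
To make the instruction usage concrete, here is a small retrieval sketch that reuses the `model` loaded above; the query and passages are made-up examples:

```python
queries = ["查询: 股票今天涨了吗?"]
passages = ["结果: 今日沪深两市指数集体收涨。", "结果: 这家餐厅的菜品非常好吃。"]
q_emb = model.encode(queries, normalize_embeddings=True)
p_emb = model.encode(passages, normalize_embeddings=True)
# The highest cosine similarity picks the relevant passage for each query.
print((q_emb @ p_emb.T).argmax(axis=1))
```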

Using the transformers library directly:

```python
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.preprocessing import normalize

model = AutoModel.from_pretrained('infgrad/stella-base-zh')
tokenizer = AutoTokenizer.from_pretrained('infgrad/stella-base-zh')
sentences = ["数据1", "数据ABCDEFGH"]
batch_data = tokenizer(
    batch_text_or_text_pairs=sentences,
    padding="longest",
    return_tensors="pt",
    max_length=1024,
    truncation=True,
)
attention_mask = batch_data["attention_mask"]
with torch.no_grad():  # inference only; also lets sklearn convert the tensors
    model_output = model(**batch_data)
# Mean pooling over the non-padding tokens.
last_hidden = model_output.last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
vectors = normalize(vectors, norm="l2", axis=1)
print(vectors.shape)  # (2, 768)
```

## Training Detail

**Hardware:** a single A100-80GB GPU

**Environment:** torch 1.13.*; transformers Trainer + DeepSpeed + gradient checkpointing

**Learning rate:** 1e-6

**batch_size:** 1024 for the base model and 768 for the large model, each augmented with an extra 20% of hard negatives

**Data volume:** about 1 million samples, of which roughly 200K were constructed with an LLM (a 13B model)

## ToDoList

**Evaluation stability:**
During evaluation we found that the Clustering tasks do not exactly match the official results, with small gaps of about ±0.0x that are essentially negligible and do not affect the conclusions.\
Still, the mismatch is hard to explain; the bge and piccolo models show the same issue, and our guess is that it relates to the environment, e.g. the libraries used and the batch size.

**Higher-quality long-text training and test data:** most of the training data was constructed with a 13B model, so it certainly contains noise.
The test data was mostly derived from MRC datasets, so the questions are all factoid-style and do not match the real-world distribution.

**OOD performance:** although many embedding models have appeared recently, on less general domains this whole family of models, stella, openai and cohere included, still underperforms BM25.

## Reference

1. https://www.scidb.cn/en/detail?dataSetId=c6a3fe684227415a9db8e21bac4a15ab
2. https://github.com/wangyuxinwhy/uniem
3. https://github.com/CLUEbenchmark/SimCLUE
4. https://arxiv.org/abs/1612.00796
5. https://kexue.fm/archives/8847
6. https://huggingface.co/sensenova/piccolo-base-zh
7. https://kexue.fm/archives/7947
8. https://github.com/FlagOpen/FlagEmbedding
9. https://github.com/THUDM/LongBench

config.json ADDED
@@ -0,0 +1,35 @@
{
  "_name_or_path": "/triton-nas/users/tmp_zhangdun/public_model/piccolo-base-zh-1024",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "directionality": "bidi",
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1024,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float16",
  "transformers_version": "4.30.2",
  "type_vocab_size": 2,
  "uniem_pooling_strategy": "last_mean",
  "use_cache": true,
  "vocab_size": 21128
}
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:18bb804df8a017b436f207056465f9b134cbc50175bf611be78aa6b8de837790
size 205397037
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
{
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "model_max_length": 1024,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff