File size: 10,844 Bytes
2d40503
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
 
 
 
 
 
 
 
 
 
6e72a58
2d40503
6e72a58
2d40503
6e72a58
2d40503
6e72a58
7ed166f
 
 
cac188d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
---
tags:
- mteb
model-index:
- name: student
  results:
  - task:
      type: STS
    dataset:
      type: C-MTEB/AFQMC
      name: MTEB AFQMC
      config: default
      split: validation
      revision: None
    metrics:
    - type: cos_sim_pearson
      value: 42.01013972878128
    - type: cos_sim_spearman
      value: 43.4493974759166
    - type: euclidean_pearson
      value: 41.9332741602486
    - type: euclidean_spearman
      value: 43.4565546063627
    - type: manhattan_pearson
      value: 41.9297043571561
    - type: manhattan_spearman
      value: 43.44509515848548
  - task:
      type: STS
    dataset:
      type: C-MTEB/ATEC
      name: MTEB ATEC
      config: default
      split: test
      revision: None
    metrics:
    - type: cos_sim_pearson
      value: 47.48357848831134
    - type: cos_sim_spearman
      value: 48.0096502737997
  - task:
      type: STS
    dataset:
      type: mteb/biosses-sts
      name: MTEB BIOSSES
      config: default
      split: test
      revision: d3fb88f8f02e40887cd149695127462bbcf29b4a
    metrics:
    - type: cos_sim_pearson
      value: 70.06631340065852
    - type: cos_sim_spearman
      value: 70.56425845690775
  - task:
      type: STS
    dataset:
      type: C-MTEB/BQ
      name: MTEB BQ
      config: default
      split: test
      revision: None
    metrics:
    - type: cos_sim_pearson
      value: 63.30619967351764
    - type: cos_sim_spearman
      value: 65.57791727146774
    - type: euclidean_pearson
      value: 64.41653053459552
    - type: euclidean_spearman
      value: 65.60244311139472
    - type: manhattan_pearson
      value: 64.37518298990945
    - type: manhattan_spearman
      value: 65.56983205786409
  - task:
      type: BitextMining
    dataset:
      type: mteb/bucc-bitext-mining
      name: MTEB BUCC (zh-en)
      config: zh-en
      split: test
      revision: d51519689f32196a32af33b075a01d0e7c51e252
    metrics:
    - type: accuracy
      value: 98.42022116903634
    - type: f1
      value: 98.38511497279269
    - type: precision
      value: 98.36756187467088
    - type: recall
      value: 98.42022116903634
  - task:
      type: STS
    dataset:
      type: C-MTEB/LCQMC
      name: MTEB LCQMC
      config: default
      split: test
      revision: None
    metrics:
    - type: cos_sim_pearson
      value: 71.3095132213625
    - type: cos_sim_spearman
      value: 75.55615792829865
    - type: euclidean_pearson
      value: 74.37147909656647
    - type: euclidean_spearman
      value: 75.54784459711308
    - type: manhattan_pearson
      value: 74.29759624788565
    - type: manhattan_spearman
      value: 75.49037321257157
  - task:
      type: STS
    dataset:
      type: C-MTEB/PAWSX
      name: MTEB PAWSX
      config: default
      split: test
      revision: None
    metrics:
    - type: cos_sim_pearson
      value: 42.821882144591406
    - type: cos_sim_spearman
      value: 47.616725737501724
    - type: euclidean_pearson
      value: 46.991556480777675
    - type: euclidean_spearman
      value: 47.624128831089685
    - type: manhattan_pearson
      value: 46.83451589707148
    - type: manhattan_spearman
      value: 47.47345373932411
  - task:
      type: STS
    dataset:
      type: C-MTEB/QBQTC
      name: MTEB QBQTC
      config: default
      split: test
      revision: None
    metrics:
    - type: cos_sim_pearson
      value: 39.48274306266568
    - type: cos_sim_spearman
      value: 40.43254828668596
    - type: euclidean_pearson
      value: 39.121198397707374
    - type: euclidean_spearman
      value: 40.47848829374869
    - type: manhattan_pearson
      value: 39.07044184765326
    - type: manhattan_spearman
      value: 40.41344728276686
  - task:
      type: STS
    dataset:
      type: mteb/sickr-sts
      name: MTEB SICK-R
      config: default
      split: test
      revision: a6ea5a8cab320b040a23452cc28066d9beae2cee
    metrics:
    - type: cos_sim_pearson
      value: 81.60488630930521
    - type: cos_sim_spearman
      value: 79.04311658059933
    - type: euclidean_pearson
      value: 78.95158745413384
    - type: euclidean_spearman
      value: 78.99206332696008
    - type: manhattan_pearson
      value: 78.93956396383128
    - type: manhattan_spearman
      value: 78.94138617747835
  - task:
      type: STS
    dataset:
      type: mteb/sts12-sts
      name: MTEB STS12
      config: default
      split: test
      revision: a0d554a64d88156834ff5ae9920b964011b16384
    metrics:
    - type: cos_sim_pearson
      value: 85.50516203958485
    - type: cos_sim_spearman
      value: 78.39314964894021
    - type: euclidean_pearson
      value: 83.03876157406377
    - type: euclidean_spearman
      value: 78.43128279495177
    - type: manhattan_pearson
      value: 83.00734833664097
    - type: manhattan_spearman
      value: 78.33755694741544
  - task:
      type: STS
    dataset:
      type: mteb/sts13-sts
      name: MTEB STS13
      config: default
      split: test
      revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca
    metrics:
    - type: cos_sim_pearson
      value: 82.52249245791886
    - type: cos_sim_spearman
      value: 83.71503684399218
    - type: euclidean_pearson
      value: 82.83033355582003
    - type: euclidean_spearman
      value: 83.6956570069731
    - type: manhattan_pearson
      value: 82.74415910929217
    - type: manhattan_spearman
      value: 83.58167243171766
  - task:
      type: STS
    dataset:
      type: mteb/sts14-sts
      name: MTEB STS14
      config: default
      split: test
      revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375
    metrics:
    - type: cos_sim_pearson
      value: 81.00915974657362
    - type: cos_sim_spearman
      value: 79.19276300509559
    - type: euclidean_pearson
      value: 80.17657754340593
    - type: euclidean_spearman
      value: 79.19425018312683
    - type: manhattan_pearson
      value: 80.04321829436775
    - type: manhattan_spearman
      value: 79.0458687679498
  - task:
      type: STS
    dataset:
      type: mteb/sts15-sts
      name: MTEB STS15
      config: default
      split: test
      revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3
    metrics:
    - type: cos_sim_pearson
      value: 84.99452083625762
    - type: cos_sim_spearman
      value: 85.57952966879047
    - type: euclidean_pearson
      value: 85.14932626009531
    - type: euclidean_spearman
      value: 85.59697259700918
    - type: manhattan_pearson
      value: 85.11214415799934
    - type: manhattan_spearman
      value: 85.54871088485925
  - task:
      type: STS
    dataset:
      type: mteb/sts16-sts
      name: MTEB STS16
      config: default
      split: test
      revision: 4d8694f8f0e0100860b497b999b3dbed754a0513
    metrics:
    - type: cos_sim_pearson
      value: 80.33170312674788
    - type: cos_sim_spearman
      value: 82.3316942254394
    - type: euclidean_pearson
      value: 82.00948134099386
    - type: euclidean_spearman
      value: 82.32475375375705
    - type: manhattan_pearson
      value: 81.94953036676401
    - type: manhattan_spearman
      value: 82.26329177825353
  - task:
      type: STS
    dataset:
      type: mteb/sts17-crosslingual-sts
      name: MTEB STS17 (en-en)
      config: en-en
      split: test
      revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d
    metrics:
    - type: cos_sim_pearson
      value: 87.60426458021554
    - type: cos_sim_spearman
      value: 87.89776827373123
    - type: euclidean_pearson
      value: 88.19401282603557
    - type: euclidean_spearman
      value: 87.90080500648473
    - type: manhattan_pearson
      value: 88.39099772653003
    - type: manhattan_spearman
      value: 88.03019288557621
  - task:
      type: STS
    dataset:
      type: mteb/sts22-crosslingual-sts
      name: MTEB STS22 (en)
      config: en
      split: test
      revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
    metrics:
    - type: cos_sim_pearson
      value: 60.38925903960008
    - type: cos_sim_spearman
      value: 63.91952542589123
  - task:
      type: STS
    dataset:
      type: mteb/sts22-crosslingual-sts
      name: MTEB STS22 (zh)
      config: zh
      split: test
      revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
    metrics:
    - type: cos_sim_pearson
      value: 61.51076949065575
    - type: cos_sim_spearman
      value: 67.24427398434739
  - task:
      type: STS
    dataset:
      type: mteb/sts22-crosslingual-sts
      name: MTEB STS22 (zh-en)
      config: zh-en
      split: test
      revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
    metrics:
    - type: cos_sim_pearson
      value: 70.08946142653247
    - type: cos_sim_spearman
      value: 70.01280058113731
  - task:
      type: STS
    dataset:
      type: C-MTEB/STSB
      name: MTEB STSB
      config: default
      split: test
      revision: None
    metrics:
    - type: cos_sim_pearson
      value: 75.52896222293855
    - type: cos_sim_spearman
      value: 75.38140772041567
  - task:
      type: STS
    dataset:
      type: mteb/stsbenchmark-sts
      name: MTEB STSBenchmark
      config: default
      split: test
      revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831
    metrics:
    - type: cos_sim_pearson
      value: 85.09649790270096
    - type: cos_sim_spearman
      value: 85.99053080606336
    - type: euclidean_pearson
      value: 85.9554143396231
    - type: euclidean_spearman
      value: 85.9826211701156
    - type: manhattan_pearson
      value: 85.91951912635923
    - type: manhattan_spearman
      value: 85.90751385480418
  - task:
      type: BitextMining
    dataset:
      type: mteb/tatoeba-bitext-mining
      name: MTEB Tatoeba (cmn-eng)
      config: cmn-eng
      split: test
      revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553
    metrics:
    - type: accuracy
      value: 96.3
    - type: f1
      value: 95.15
    - type: precision
      value: 94.58333333333333
    - type: recall
      value: 96.3
---


This model was obtained by fine-tuning XLM-R (xlmr) with contrastive learning on Chinese and English STS and NLI corpora.



## Using HuggingFace Transformers

```python
from transformers import AutoTokenizer, AutoModel
import torch
# Example inputs: the texts to embed
texts = ["样例数据-1", "样例数据-2"]

# Fetch the tokenizer and the encoder from the HuggingFace Hub
checkpoint = 'zhou-xl/bi-cse'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)
model.eval()  # switch the module to evaluation mode

# Convert the texts into one padded, truncated batch of PyTorch tensors
batch = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Run the forward pass with gradient tracking turned off
with torch.no_grad():
    outputs = model(**batch)
    # CLS pooling: keep position 0 of the first model output for every text
    token_states = outputs[0]
    cls_vectors = token_states[:, 0]

# L2-normalize each pooled vector along dim 1
embeddings = torch.nn.functional.normalize(cls_vectors, p=2, dim=1)
print("Sentence embeddings:", embeddings)
```