|
--- |
|
tags: |
|
- mteb |
|
- sentence-similarity |
|
- sentence-transformers |
|
- Sentence Transformers |
|
model-index: |
|
- name: gte-small-zh |
|
results: |
|
- task: |
|
type: STS |
|
dataset: |
|
type: C-MTEB/AFQMC |
|
name: MTEB AFQMC |
|
config: default |
|
split: validation |
|
revision: None |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 35.80906032378281 |
|
- type: cos_sim_spearman |
|
value: 36.688967176174415 |
|
- type: euclidean_pearson |
|
value: 35.70701955438158 |
|
- type: euclidean_spearman |
|
value: 36.6889470691436 |
|
- type: manhattan_pearson |
|
value: 35.832741768286944 |
|
- type: manhattan_spearman |
|
value: 36.831888591957195 |
|
- task: |
|
type: STS |
|
dataset: |
|
type: C-MTEB/ATEC |
|
name: MTEB ATEC |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 44.667266488330384 |
|
- type: cos_sim_spearman |
|
value: 45.77390794946174 |
|
- type: euclidean_pearson |
|
value: 48.14272832901943 |
|
- type: euclidean_spearman |
|
value: 45.77390569666109 |
|
- type: manhattan_pearson |
|
value: 48.187667158563094 |
|
- type: manhattan_spearman |
|
value: 45.80979161966117 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: mteb/amazon_reviews_multi |
|
name: MTEB AmazonReviewsClassification (zh) |
|
config: zh |
|
split: test |
|
revision: 1399c76144fd37290681b995c656ef9b2e06e26d |
|
metrics: |
|
- type: accuracy |
|
value: 38.690000000000005 |
|
- type: f1 |
|
value: 36.868257131984016 |
|
- task: |
|
type: STS |
|
dataset: |
|
type: C-MTEB/BQ |
|
name: MTEB BQ |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 49.03674224607541 |
|
- type: cos_sim_spearman |
|
value: 49.63568854885055 |
|
- type: euclidean_pearson |
|
value: 49.47441886441355 |
|
- type: euclidean_spearman |
|
value: 49.63567815431205 |
|
- type: manhattan_pearson |
|
value: 49.76480072909559 |
|
- type: manhattan_spearman |
|
value: 49.977789367288224 |
|
- task: |
|
type: Clustering |
|
dataset: |
|
type: C-MTEB/CLSClusteringP2P |
|
name: MTEB CLSClusteringP2P |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: v_measure |
|
value: 39.538126779019755 |
|
- task: |
|
type: Clustering |
|
dataset: |
|
type: C-MTEB/CLSClusteringS2S |
|
name: MTEB CLSClusteringS2S |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: v_measure |
|
value: 37.333105487031766 |
|
- task: |
|
type: Reranking |
|
dataset: |
|
type: C-MTEB/CMedQAv1-reranking |
|
name: MTEB CMedQAv1 |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: map |
|
value: 86.08142426347963 |
|
- type: mrr |
|
value: 88.04269841269841 |
|
- task: |
|
type: Reranking |
|
dataset: |
|
type: C-MTEB/CMedQAv2-reranking |
|
name: MTEB CMedQAv2 |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: map |
|
value: 87.25694119382474 |
|
- type: mrr |
|
value: 89.36853174603175 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/CmedqaRetrieval |
|
name: MTEB CmedqaRetrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 23.913999999999998 |
|
- type: map_at_10 |
|
value: 35.913000000000004 |
|
- type: map_at_100 |
|
value: 37.836 |
|
- type: map_at_1000 |
|
value: 37.952000000000005 |
|
- type: map_at_3 |
|
value: 31.845000000000002 |
|
- type: map_at_5 |
|
value: 34.0 |
|
- type: mrr_at_1 |
|
value: 36.884 |
|
- type: mrr_at_10 |
|
value: 44.872 |
|
- type: mrr_at_100 |
|
value: 45.899 |
|
- type: mrr_at_1000 |
|
value: 45.945 |
|
- type: mrr_at_3 |
|
value: 42.331 |
|
- type: mrr_at_5 |
|
value: 43.674 |
|
- type: ndcg_at_1 |
|
value: 36.884 |
|
- type: ndcg_at_10 |
|
value: 42.459 |
|
- type: ndcg_at_100 |
|
value: 50.046 |
|
- type: ndcg_at_1000 |
|
value: 52.092000000000006 |
|
- type: ndcg_at_3 |
|
value: 37.225 |
|
- type: ndcg_at_5 |
|
value: 39.2 |
|
- type: precision_at_1 |
|
value: 36.884 |
|
- type: precision_at_10 |
|
value: 9.562 |
|
- type: precision_at_100 |
|
value: 1.572 |
|
- type: precision_at_1000 |
|
value: 0.183 |
|
- type: precision_at_3 |
|
value: 21.122 |
|
- type: precision_at_5 |
|
value: 15.274 |
|
- type: recall_at_1 |
|
value: 23.913999999999998 |
|
- type: recall_at_10 |
|
value: 52.891999999999996 |
|
- type: recall_at_100 |
|
value: 84.328 |
|
- type: recall_at_1000 |
|
value: 98.168 |
|
- type: recall_at_3 |
|
value: 37.095 |
|
- type: recall_at_5 |
|
value: 43.396 |
|
- task: |
|
type: PairClassification |
|
dataset: |
|
type: C-MTEB/CMNLI |
|
name: MTEB Cmnli |
|
config: default |
|
split: validation |
|
revision: None |
|
metrics: |
|
- type: cos_sim_accuracy |
|
value: 68.91160553217077 |
|
- type: cos_sim_ap |
|
value: 76.45769658379533 |
|
- type: cos_sim_f1 |
|
value: 72.07988702844463 |
|
- type: cos_sim_precision |
|
value: 63.384779137839274 |
|
- type: cos_sim_recall |
|
value: 83.53986439092822 |
|
- type: dot_accuracy |
|
value: 68.91160553217077 |
|
- type: dot_ap |
|
value: 76.47279917239219 |
|
- type: dot_f1 |
|
value: 72.07988702844463 |
|
- type: dot_precision |
|
value: 63.384779137839274 |
|
- type: dot_recall |
|
value: 83.53986439092822 |
|
- type: euclidean_accuracy |
|
value: 68.91160553217077 |
|
- type: euclidean_ap |
|
value: 76.45768544225383 |
|
- type: euclidean_f1 |
|
value: 72.07988702844463 |
|
- type: euclidean_precision |
|
value: 63.384779137839274 |
|
- type: euclidean_recall |
|
value: 83.53986439092822 |
|
- type: manhattan_accuracy |
|
value: 69.21226698737222 |
|
- type: manhattan_ap |
|
value: 76.6623683693766 |
|
- type: manhattan_f1 |
|
value: 72.14058164628506 |
|
- type: manhattan_precision |
|
value: 64.35643564356435 |
|
- type: manhattan_recall |
|
value: 82.06686930091185 |
|
- type: max_accuracy |
|
value: 69.21226698737222 |
|
- type: max_ap |
|
value: 76.6623683693766 |
|
- type: max_f1 |
|
value: 72.14058164628506 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/CovidRetrieval |
|
name: MTEB CovidRetrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 48.419000000000004 |
|
- type: map_at_10 |
|
value: 57.367999999999995 |
|
- type: map_at_100 |
|
value: 58.081 |
|
- type: map_at_1000 |
|
value: 58.108000000000004 |
|
- type: map_at_3 |
|
value: 55.251 |
|
- type: map_at_5 |
|
value: 56.53399999999999 |
|
- type: mrr_at_1 |
|
value: 48.472 |
|
- type: mrr_at_10 |
|
value: 57.359 |
|
- type: mrr_at_100 |
|
value: 58.055 |
|
- type: mrr_at_1000 |
|
value: 58.082 |
|
- type: mrr_at_3 |
|
value: 55.303999999999995 |
|
- type: mrr_at_5 |
|
value: 56.542 |
|
- type: ndcg_at_1 |
|
value: 48.472 |
|
- type: ndcg_at_10 |
|
value: 61.651999999999994 |
|
- type: ndcg_at_100 |
|
value: 65.257 |
|
- type: ndcg_at_1000 |
|
value: 65.977 |
|
- type: ndcg_at_3 |
|
value: 57.401 |
|
- type: ndcg_at_5 |
|
value: 59.681 |
|
- type: precision_at_1 |
|
value: 48.472 |
|
- type: precision_at_10 |
|
value: 7.576 |
|
- type: precision_at_100 |
|
value: 0.932 |
|
- type: precision_at_1000 |
|
value: 0.099 |
|
- type: precision_at_3 |
|
value: 21.25 |
|
- type: precision_at_5 |
|
value: 13.888 |
|
- type: recall_at_1 |
|
value: 48.419000000000004 |
|
- type: recall_at_10 |
|
value: 74.97399999999999 |
|
- type: recall_at_100 |
|
value: 92.202 |
|
- type: recall_at_1000 |
|
value: 97.893 |
|
- type: recall_at_3 |
|
value: 63.541000000000004 |
|
- type: recall_at_5 |
|
value: 68.994 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/DuRetrieval |
|
name: MTEB DuRetrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 22.328 |
|
- type: map_at_10 |
|
value: 69.11 |
|
- type: map_at_100 |
|
value: 72.47 |
|
- type: map_at_1000 |
|
value: 72.54599999999999 |
|
- type: map_at_3 |
|
value: 46.938 |
|
- type: map_at_5 |
|
value: 59.56 |
|
- type: mrr_at_1 |
|
value: 81.35 |
|
- type: mrr_at_10 |
|
value: 87.066 |
|
- type: mrr_at_100 |
|
value: 87.212 |
|
- type: mrr_at_1000 |
|
value: 87.21799999999999 |
|
- type: mrr_at_3 |
|
value: 86.558 |
|
- type: mrr_at_5 |
|
value: 86.931 |
|
- type: ndcg_at_1 |
|
value: 81.35 |
|
- type: ndcg_at_10 |
|
value: 78.568 |
|
- type: ndcg_at_100 |
|
value: 82.86099999999999 |
|
- type: ndcg_at_1000 |
|
value: 83.628 |
|
- type: ndcg_at_3 |
|
value: 76.716 |
|
- type: ndcg_at_5 |
|
value: 75.664 |
|
- type: precision_at_1 |
|
value: 81.35 |
|
- type: precision_at_10 |
|
value: 38.545 |
|
- type: precision_at_100 |
|
value: 4.657 |
|
- type: precision_at_1000 |
|
value: 0.484 |
|
- type: precision_at_3 |
|
value: 69.18299999999999 |
|
- type: precision_at_5 |
|
value: 58.67 |
|
- type: recall_at_1 |
|
value: 22.328 |
|
- type: recall_at_10 |
|
value: 80.658 |
|
- type: recall_at_100 |
|
value: 94.093 |
|
- type: recall_at_1000 |
|
value: 98.137 |
|
- type: recall_at_3 |
|
value: 50.260000000000005 |
|
- type: recall_at_5 |
|
value: 66.045 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/EcomRetrieval |
|
name: MTEB EcomRetrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 43.1 |
|
- type: map_at_10 |
|
value: 52.872 |
|
- type: map_at_100 |
|
value: 53.556000000000004 |
|
- type: map_at_1000 |
|
value: 53.583000000000006 |
|
- type: map_at_3 |
|
value: 50.14999999999999 |
|
- type: map_at_5 |
|
value: 51.925 |
|
- type: mrr_at_1 |
|
value: 43.1 |
|
- type: mrr_at_10 |
|
value: 52.872 |
|
- type: mrr_at_100 |
|
value: 53.556000000000004 |
|
- type: mrr_at_1000 |
|
value: 53.583000000000006 |
|
- type: mrr_at_3 |
|
value: 50.14999999999999 |
|
- type: mrr_at_5 |
|
value: 51.925 |
|
- type: ndcg_at_1 |
|
value: 43.1 |
|
- type: ndcg_at_10 |
|
value: 57.907 |
|
- type: ndcg_at_100 |
|
value: 61.517999999999994 |
|
- type: ndcg_at_1000 |
|
value: 62.175000000000004 |
|
- type: ndcg_at_3 |
|
value: 52.425 |
|
- type: ndcg_at_5 |
|
value: 55.631 |
|
- type: precision_at_1 |
|
value: 43.1 |
|
- type: precision_at_10 |
|
value: 7.380000000000001 |
|
- type: precision_at_100 |
|
value: 0.9129999999999999 |
|
- type: precision_at_1000 |
|
value: 0.096 |
|
- type: precision_at_3 |
|
value: 19.667 |
|
- type: precision_at_5 |
|
value: 13.36 |
|
- type: recall_at_1 |
|
value: 43.1 |
|
- type: recall_at_10 |
|
value: 73.8 |
|
- type: recall_at_100 |
|
value: 91.3 |
|
- type: recall_at_1000 |
|
value: 96.39999999999999 |
|
- type: recall_at_3 |
|
value: 59.0 |
|
- type: recall_at_5 |
|
value: 66.8 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: C-MTEB/IFlyTek-classification |
|
name: MTEB IFlyTek |
|
config: default |
|
split: validation |
|
revision: None |
|
metrics: |
|
- type: accuracy |
|
value: 41.146594844170835 |
|
- type: f1 |
|
value: 28.544218732704845 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: C-MTEB/JDReview-classification |
|
name: MTEB JDReview |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: accuracy |
|
value: 82.83302063789868 |
|
- type: ap |
|
value: 48.881798834997056 |
|
- type: f1 |
|
value: 77.28655923994657 |
|
- task: |
|
type: STS |
|
dataset: |
|
type: C-MTEB/LCQMC |
|
name: MTEB LCQMC |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 66.05467125345538 |
|
- type: cos_sim_spearman |
|
value: 72.71921060562211 |
|
- type: euclidean_pearson |
|
value: 71.28539457113986 |
|
- type: euclidean_spearman |
|
value: 72.71920173126693 |
|
- type: manhattan_pearson |
|
value: 71.23750818174456 |
|
- type: manhattan_spearman |
|
value: 72.61025268693467 |
|
- task: |
|
type: Reranking |
|
dataset: |
|
type: C-MTEB/Mmarco-reranking |
|
name: MTEB MMarcoReranking |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map |
|
value: 26.127712982639483 |
|
- type: mrr |
|
value: 24.87420634920635 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/MMarcoRetrieval |
|
name: MTEB MMarcoRetrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 62.517 |
|
- type: map_at_10 |
|
value: 71.251 |
|
- type: map_at_100 |
|
value: 71.647 |
|
- type: map_at_1000 |
|
value: 71.665 |
|
- type: map_at_3 |
|
value: 69.28 |
|
- type: map_at_5 |
|
value: 70.489 |
|
- type: mrr_at_1 |
|
value: 64.613 |
|
- type: mrr_at_10 |
|
value: 71.89 |
|
- type: mrr_at_100 |
|
value: 72.243 |
|
- type: mrr_at_1000 |
|
value: 72.259 |
|
- type: mrr_at_3 |
|
value: 70.138 |
|
- type: mrr_at_5 |
|
value: 71.232 |
|
- type: ndcg_at_1 |
|
value: 64.613 |
|
- type: ndcg_at_10 |
|
value: 75.005 |
|
- type: ndcg_at_100 |
|
value: 76.805 |
|
- type: ndcg_at_1000 |
|
value: 77.281 |
|
- type: ndcg_at_3 |
|
value: 71.234 |
|
- type: ndcg_at_5 |
|
value: 73.294 |
|
- type: precision_at_1 |
|
value: 64.613 |
|
- type: precision_at_10 |
|
value: 9.142 |
|
- type: precision_at_100 |
|
value: 1.004 |
|
- type: precision_at_1000 |
|
value: 0.104 |
|
- type: precision_at_3 |
|
value: 26.781 |
|
- type: precision_at_5 |
|
value: 17.149 |
|
- type: recall_at_1 |
|
value: 62.517 |
|
- type: recall_at_10 |
|
value: 85.997 |
|
- type: recall_at_100 |
|
value: 94.18299999999999 |
|
- type: recall_at_1000 |
|
value: 97.911 |
|
- type: recall_at_3 |
|
value: 75.993 |
|
- type: recall_at_5 |
|
value: 80.88300000000001 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: mteb/amazon_massive_intent |
|
name: MTEB MassiveIntentClassification (zh-CN) |
|
config: zh-CN |
|
split: test |
|
revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 |
|
metrics: |
|
- type: accuracy |
|
value: 59.27706792199058 |
|
- type: f1 |
|
value: 56.77545011902468 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: mteb/amazon_massive_scenario |
|
name: MTEB MassiveScenarioClassification (zh-CN) |
|
config: zh-CN |
|
split: test |
|
revision: 7d571f92784cd94a019292a1f45445077d0ef634 |
|
metrics: |
|
- type: accuracy |
|
value: 66.47948890383321 |
|
- type: f1 |
|
value: 66.4502180376861 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/MedicalRetrieval |
|
name: MTEB MedicalRetrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 54.2 |
|
- type: map_at_10 |
|
value: 59.858 |
|
- type: map_at_100 |
|
value: 60.46 |
|
- type: map_at_1000 |
|
value: 60.507 |
|
- type: map_at_3 |
|
value: 58.416999999999994 |
|
- type: map_at_5 |
|
value: 59.331999999999994 |
|
- type: mrr_at_1 |
|
value: 54.2 |
|
- type: mrr_at_10 |
|
value: 59.862 |
|
- type: mrr_at_100 |
|
value: 60.463 |
|
- type: mrr_at_1000 |
|
value: 60.51 |
|
- type: mrr_at_3 |
|
value: 58.416999999999994 |
|
- type: mrr_at_5 |
|
value: 59.352000000000004 |
|
- type: ndcg_at_1 |
|
value: 54.2 |
|
- type: ndcg_at_10 |
|
value: 62.643 |
|
- type: ndcg_at_100 |
|
value: 65.731 |
|
- type: ndcg_at_1000 |
|
value: 67.096 |
|
- type: ndcg_at_3 |
|
value: 59.727 |
|
- type: ndcg_at_5 |
|
value: 61.375 |
|
- type: precision_at_1 |
|
value: 54.2 |
|
- type: precision_at_10 |
|
value: 7.140000000000001 |
|
- type: precision_at_100 |
|
value: 0.8619999999999999 |
|
- type: precision_at_1000 |
|
value: 0.097 |
|
- type: precision_at_3 |
|
value: 21.166999999999998 |
|
- type: precision_at_5 |
|
value: 13.5 |
|
- type: recall_at_1 |
|
value: 54.2 |
|
- type: recall_at_10 |
|
value: 71.39999999999999 |
|
- type: recall_at_100 |
|
value: 86.2 |
|
- type: recall_at_1000 |
|
value: 97.2 |
|
- type: recall_at_3 |
|
value: 63.5 |
|
- type: recall_at_5 |
|
value: 67.5 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: C-MTEB/MultilingualSentiment-classification |
|
name: MTEB MultilingualSentiment |
|
config: default |
|
split: validation |
|
revision: None |
|
metrics: |
|
- type: accuracy |
|
value: 68.19666666666666 |
|
- type: f1 |
|
value: 67.58581661416034 |
|
- task: |
|
type: PairClassification |
|
dataset: |
|
type: C-MTEB/OCNLI |
|
name: MTEB Ocnli |
|
config: default |
|
split: validation |
|
revision: None |
|
metrics: |
|
- type: cos_sim_accuracy |
|
value: 60.530590146182995 |
|
- type: cos_sim_ap |
|
value: 63.53656091243922 |
|
- type: cos_sim_f1 |
|
value: 68.09929603556874 |
|
- type: cos_sim_precision |
|
value: 52.45433789954338 |
|
- type: cos_sim_recall |
|
value: 97.04329461457233 |
|
- type: dot_accuracy |
|
value: 60.530590146182995 |
|
- type: dot_ap |
|
value: 63.53660452157237 |
|
- type: dot_f1 |
|
value: 68.09929603556874 |
|
- type: dot_precision |
|
value: 52.45433789954338 |
|
- type: dot_recall |
|
value: 97.04329461457233 |
|
- type: euclidean_accuracy |
|
value: 60.530590146182995 |
|
- type: euclidean_ap |
|
value: 63.53678735855631 |
|
- type: euclidean_f1 |
|
value: 68.09929603556874 |
|
- type: euclidean_precision |
|
value: 52.45433789954338 |
|
- type: euclidean_recall |
|
value: 97.04329461457233 |
|
- type: manhattan_accuracy |
|
value: 60.47644829453167 |
|
- type: manhattan_ap |
|
value: 63.5622508250315 |
|
- type: manhattan_f1 |
|
value: 68.1650700073692 |
|
- type: manhattan_precision |
|
value: 52.34861346915677 |
|
- type: manhattan_recall |
|
value: 97.67687434002113 |
|
- type: max_accuracy |
|
value: 60.530590146182995 |
|
- type: max_ap |
|
value: 63.5622508250315 |
|
- type: max_f1 |
|
value: 68.1650700073692 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: C-MTEB/OnlineShopping-classification |
|
name: MTEB OnlineShopping |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: accuracy |
|
value: 89.13 |
|
- type: ap |
|
value: 87.21879260137172 |
|
- type: f1 |
|
value: 89.12359325300508 |
|
- task: |
|
type: STS |
|
dataset: |
|
type: C-MTEB/PAWSX |
|
name: MTEB PAWSX |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 12.035577637900758 |
|
- type: cos_sim_spearman |
|
value: 12.76524190663864 |
|
- type: euclidean_pearson |
|
value: 14.4012689427106 |
|
- type: euclidean_spearman |
|
value: 12.765328992583608 |
|
- type: manhattan_pearson |
|
value: 14.458505202938946 |
|
- type: manhattan_spearman |
|
value: 12.763238700117896 |
|
- task: |
|
type: STS |
|
dataset: |
|
type: C-MTEB/QBQTC |
|
name: MTEB QBQTC |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 34.809415339934006 |
|
- type: cos_sim_spearman |
|
value: 36.96728615916954 |
|
- type: euclidean_pearson |
|
value: 35.56113673772396 |
|
- type: euclidean_spearman |
|
value: 36.96842963389308 |
|
- type: manhattan_pearson |
|
value: 35.5447066178264 |
|
- type: manhattan_spearman |
|
value: 36.97514513480951 |
|
- task: |
|
type: STS |
|
dataset: |
|
type: mteb/sts22-crosslingual-sts |
|
name: MTEB STS22 (zh) |
|
config: zh |
|
split: test |
|
revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80 |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 66.39448692338551 |
|
- type: cos_sim_spearman |
|
value: 66.72211526923901 |
|
- type: euclidean_pearson |
|
value: 65.72981824553035 |
|
- type: euclidean_spearman |
|
value: 66.72211526923901 |
|
- type: manhattan_pearson |
|
value: 65.52315559414296 |
|
- type: manhattan_spearman |
|
value: 66.61931702511545 |
|
- task: |
|
type: STS |
|
dataset: |
|
type: C-MTEB/STSB |
|
name: MTEB STSB |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 76.73608064460915 |
|
- type: cos_sim_spearman |
|
value: 76.51424826130031 |
|
- type: euclidean_pearson |
|
value: 76.17930213372487 |
|
- type: euclidean_spearman |
|
value: 76.51342756283478 |
|
- type: manhattan_pearson |
|
value: 75.87085607319342 |
|
- type: manhattan_spearman |
|
value: 76.22676341477134 |
|
- task: |
|
type: Reranking |
|
dataset: |
|
type: C-MTEB/T2Reranking |
|
name: MTEB T2Reranking |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map |
|
value: 65.38779931543048 |
|
- type: mrr |
|
value: 74.79313763420059 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/T2Retrieval |
|
name: MTEB T2Retrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 25.131999999999998 |
|
- type: map_at_10 |
|
value: 69.131 |
|
- type: map_at_100 |
|
value: 72.943 |
|
- type: map_at_1000 |
|
value: 73.045 |
|
- type: map_at_3 |
|
value: 48.847 |
|
- type: map_at_5 |
|
value: 59.842 |
|
- type: mrr_at_1 |
|
value: 85.516 |
|
- type: mrr_at_10 |
|
value: 88.863 |
|
- type: mrr_at_100 |
|
value: 88.996 |
|
- type: mrr_at_1000 |
|
value: 89.00099999999999 |
|
- type: mrr_at_3 |
|
value: 88.277 |
|
- type: mrr_at_5 |
|
value: 88.64800000000001 |
|
- type: ndcg_at_1 |
|
value: 85.516 |
|
- type: ndcg_at_10 |
|
value: 78.122 |
|
- type: ndcg_at_100 |
|
value: 82.673 |
|
- type: ndcg_at_1000 |
|
value: 83.707 |
|
- type: ndcg_at_3 |
|
value: 80.274 |
|
- type: ndcg_at_5 |
|
value: 78.405 |
|
- type: precision_at_1 |
|
value: 85.516 |
|
- type: precision_at_10 |
|
value: 38.975 |
|
- type: precision_at_100 |
|
value: 4.833 |
|
- type: precision_at_1000 |
|
value: 0.509 |
|
- type: precision_at_3 |
|
value: 70.35 |
|
- type: precision_at_5 |
|
value: 58.638 |
|
- type: recall_at_1 |
|
value: 25.131999999999998 |
|
- type: recall_at_10 |
|
value: 76.848 |
|
- type: recall_at_100 |
|
value: 91.489 |
|
- type: recall_at_1000 |
|
value: 96.709 |
|
- type: recall_at_3 |
|
value: 50.824000000000005 |
|
- type: recall_at_5 |
|
value: 63.89 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: C-MTEB/TNews-classification |
|
name: MTEB TNews |
|
config: default |
|
split: validation |
|
revision: None |
|
metrics: |
|
- type: accuracy |
|
value: 49.65 |
|
- type: f1 |
|
value: 47.66791473245483 |
|
- task: |
|
type: Clustering |
|
dataset: |
|
type: C-MTEB/ThuNewsClusteringP2P |
|
name: MTEB ThuNewsClusteringP2P |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: v_measure |
|
value: 63.78843565968542 |
|
- task: |
|
type: Clustering |
|
dataset: |
|
type: C-MTEB/ThuNewsClusteringS2S |
|
name: MTEB ThuNewsClusteringS2S |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: v_measure |
|
value: 55.14095244943176 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/VideoRetrieval |
|
name: MTEB VideoRetrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 53.800000000000004 |
|
- type: map_at_10 |
|
value: 63.312000000000005 |
|
- type: map_at_100 |
|
value: 63.93600000000001 |
|
- type: map_at_1000 |
|
value: 63.955 |
|
- type: map_at_3 |
|
value: 61.283 |
|
- type: map_at_5 |
|
value: 62.553000000000004 |
|
- type: mrr_at_1 |
|
value: 53.800000000000004 |
|
- type: mrr_at_10 |
|
value: 63.312000000000005 |
|
- type: mrr_at_100 |
|
value: 63.93600000000001 |
|
- type: mrr_at_1000 |
|
value: 63.955 |
|
- type: mrr_at_3 |
|
value: 61.283 |
|
- type: mrr_at_5 |
|
value: 62.553000000000004 |
|
- type: ndcg_at_1 |
|
value: 53.800000000000004 |
|
- type: ndcg_at_10 |
|
value: 67.693 |
|
- type: ndcg_at_100 |
|
value: 70.552 |
|
- type: ndcg_at_1000 |
|
value: 71.06099999999999 |
|
- type: ndcg_at_3 |
|
value: 63.632 |
|
- type: ndcg_at_5 |
|
value: 65.90899999999999 |
|
- type: precision_at_1 |
|
value: 53.800000000000004 |
|
- type: precision_at_10 |
|
value: 8.129999999999999 |
|
- type: precision_at_100 |
|
value: 0.943 |
|
- type: precision_at_1000 |
|
value: 0.098 |
|
- type: precision_at_3 |
|
value: 23.467 |
|
- type: precision_at_5 |
|
value: 15.18 |
|
- type: recall_at_1 |
|
value: 53.800000000000004 |
|
- type: recall_at_10 |
|
value: 81.3 |
|
- type: recall_at_100 |
|
value: 94.3 |
|
- type: recall_at_1000 |
|
value: 98.3 |
|
- type: recall_at_3 |
|
value: 70.39999999999999 |
|
- type: recall_at_5 |
|
value: 75.9 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: C-MTEB/waimai-classification |
|
name: MTEB Waimai |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: accuracy |
|
value: 84.96000000000001 |
|
- type: ap |
|
value: 66.89917287702019 |
|
- type: f1 |
|
value: 83.0239988458119 |
|
language: |
|
- zh
|
license: mit |
|
--- |
|
|
|
# gte-small-zh |
|
|
|
General Text Embeddings (GTE) model. Paper: [Towards General Text Embeddings with Multi-stage Contrastive Learning](https://arxiv.org/abs/2308.03281).
|
|
|
The GTE models are trained by Alibaba DAMO Academy. They are mainly based on the BERT framework and are currently available in several sizes for both Chinese and English. The GTE models are trained on a large-scale corpus of relevant text pairs covering a wide range of domains and scenarios, which enables them to be applied to various downstream text embedding tasks, including **information retrieval**, **semantic textual similarity**, and **text reranking**.
|
|
|
## Model List |
|
|
|
| Models | Language | Max Sequence Length | Dimension | Model Size |
|:-----: | :-----: |:-----: |:-----: |:-----: |
|[GTE-large-zh](https://huggingface.co/thenlper/gte-large-zh) | Chinese | 512 | 1024 | 0.67GB |
|[GTE-base-zh](https://huggingface.co/thenlper/gte-base-zh) | Chinese | 512 | 768 | 0.21GB |
|[GTE-small-zh](https://huggingface.co/thenlper/gte-small-zh) | Chinese | 512 | 512 | 0.10GB |
|[GTE-large](https://huggingface.co/thenlper/gte-large) | English | 512 | 1024 | 0.67GB |
|[GTE-base](https://huggingface.co/thenlper/gte-base) | English | 512 | 768 | 0.21GB |
|[GTE-small](https://huggingface.co/thenlper/gte-small) | English | 512 | 384 | 0.10GB |
|
|
|
## Metrics |
|
|
|
We compared the performance of the GTE models with other popular text embedding models on the MTEB benchmark (CMTEB for Chinese). For more detailed comparison results, please refer to the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
|
|
|
- Evaluation results on CMTEB |
|
|
|
| Model | Model Size (GB) | Embedding Dimensions | Sequence Length | Average (35 datasets) | Classification (9 datasets) | Clustering (4 datasets) | Pair Classification (2 datasets) | Reranking (4 datasets) | Retrieval (8 datasets) | STS (8 datasets) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| **gte-large-zh** | 0.65 | 1024 | 512 | **66.72** | 71.34 | 53.07 | 81.14 | 67.42 | 72.49 | 57.82 |
| gte-base-zh | 0.20 | 768 | 512 | 65.92 | 71.26 | 53.86 | 80.44 | 67.00 | 71.71 | 55.96 |
| stella-large-zh-v2 | 0.65 | 1024 | 1024 | 65.13 | 69.05 | 49.16 | 82.68 | 66.41 | 70.14 | 58.66 |
| stella-large-zh | 0.65 | 1024 | 1024 | 64.54 | 67.62 | 48.65 | 78.72 | 65.98 | 71.02 | 58.3 |
| bge-large-zh-v1.5 | 1.3 | 1024 | 512 | 64.53 | 69.13 | 48.99 | 81.6 | 65.84 | 70.46 | 56.25 |
| stella-base-zh-v2 | 0.21 | 768 | 1024 | 64.36 | 68.29 | 49.4 | 79.96 | 66.1 | 70.08 | 56.92 |
| stella-base-zh | 0.21 | 768 | 1024 | 64.16 | 67.77 | 48.7 | 76.09 | 66.95 | 71.07 | 56.54 |
| piccolo-large-zh | 0.65 | 1024 | 512 | 64.11 | 67.03 | 47.04 | 78.38 | 65.98 | 70.93 | 58.02 |
| piccolo-base-zh | 0.2 | 768 | 512 | 63.66 | 66.98 | 47.12 | 76.61 | 66.68 | 71.2 | 55.9 |
| gte-small-zh | 0.1 | 512 | 512 | 60.04 | 64.35 | 48.95 | 69.99 | 66.21 | 65.50 | 49.72 |
| bge-small-zh-v1.5 | 0.1 | 512 | 512 | 57.82 | 63.96 | 44.18 | 70.4 | 60.92 | 61.77 | 49.1 |
| m3e-base | 0.41 | 768 | 512 | 57.79 | 67.52 | 47.68 | 63.99 | 59.54 | 56.91 | 50.47 |
| text-embedding-ada-002 (OpenAI) | - | 1536 | 8192 | 53.02 | 64.31 | 45.68 | 69.56 | 54.28 | 52.0 | 43.35 |
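
Results such as those above can be reproduced with the [mteb](https://github.com/embeddings-benchmark/mteb) evaluation package. Below is a minimal sketch that evaluates this model on two of the Chinese tasks; the task names and output folder are illustrative, and the exact API may vary between mteb versions.

```python
from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Load gte-small-zh through sentence-transformers so mteb can call .encode()
model = SentenceTransformer("thenlper/gte-small-zh")

# Evaluate on a small, illustrative subset of the C-MTEB tasks
evaluation = MTEB(tasks=["LCQMC", "TNews"])
evaluation.run(model, output_folder="results/gte-small-zh")
```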
|
|
|
|
|
## Usage |
|
|
|
Use with transformers:
|
|
|
```python
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

input_texts = [
    "中国的首都是哪里",
    "你喜欢去哪里旅游",
    "北京",
    "今天中午吃什么"
]

tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small-zh")
model = AutoModel.from_pretrained("thenlper/gte-small-zh")

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

outputs = model(**batch_dict)
# Use the embedding of the first token ([CLS]) as the sentence embedding
embeddings = outputs.last_hidden_state[:, 0]

# (Optionally) normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
# Similarity of the first text against the remaining texts, scaled by 100
scores = (embeddings[:1] @ embeddings[1:].T) * 100
print(scores.tolist())
```
|
|
|
Use with sentence-transformers: |
|
|
|
```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

sentences = ['That is a happy person', 'That is a very happy person']

model = SentenceTransformer('thenlper/gte-small-zh')
embeddings = model.encode(sentences)
print(cos_sim(embeddings[0], embeddings[1]))
```
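
Since information retrieval is one of the main intended uses, the sketch below shows a simple semantic-search setup on top of sentence-transformers. The corpus, query, and `top_k` value are illustrative only.

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('thenlper/gte-small-zh')

# A tiny illustrative corpus and query (in Chinese, as the model expects)
corpus = ['北京是中国的首都', '上海是一个适合旅游的城市', '今天天气很好']
query = '中国的首都是哪里'

# Encode and normalize so that dot product equals cosine similarity
corpus_embeddings = model.encode(corpus, normalize_embeddings=True)
query_embedding = model.encode(query, normalize_embeddings=True)

# Retrieve the top-2 most similar corpus entries for the query
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=2)[0]
for hit in hits:
    print(corpus[hit['corpus_id']], hit['score'])
```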
|
|
|
### Limitation |
|
|
|
This model is intended for Chinese texts only, and any input longer than 512 tokens will be truncated to that limit.
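
A quick way to check whether a text will be truncated is to tokenize it first. The helper below is a small illustrative sketch, not part of the model's API.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small-zh")

def will_be_truncated(text: str, max_length: int = 512) -> bool:
    # Count tokens, including the special tokens added by the tokenizer
    n_tokens = len(tokenizer(text)["input_ids"])
    return n_tokens > max_length

print(will_be_truncated("北京是中国的首都"))  # False for short inputs
```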
|
|
|
### Citation |
|
|
|
If you find our paper or models helpful, please consider citing them as follows: |
|
|
|
```
@article{li2023towards,
  title={Towards general text embeddings with multi-stage contrastive learning},
  author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
  journal={arXiv preprint arXiv:2308.03281},
  year={2023}
}
```
|
|