mrm8488's picture
Add new SentenceTransformer model.
a900b52 verified
metadata
language:
  - en
library_name: sentence-transformers
tags:
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - dataset_size:1K<n<10K
  - loss:MatryoshkaLoss
  - loss:CoSENTLoss
base_model: distilbert/distilbert-base-uncased
metrics:
  - pearson_cosine
  - spearman_cosine
  - pearson_manhattan
  - spearman_manhattan
  - pearson_euclidean
  - spearman_euclidean
  - pearson_dot
  - spearman_dot
  - pearson_max
  - spearman_max
widget:
  - source_sentence: A woman is dancing.
    sentences:
      - Women are dancing.
      - A toddler walks down a hallway.
      - Shinzo Abe is Japan's prime minister
  - source_sentence: A man is spitting.
    sentences:
      - A man is crying.
      - The girl is playing the guitar.
      - A slow loris hanging on a cord.
  - source_sentence: A man is speaking.
    sentences:
      - A man is talking.
      - A man plays an acoustic guitar.
      - The dogs are chasing a cat.
  - source_sentence: A plane in the sky.
    sentences:
      - Two airplanes in the sky.
      - A slow loris hanging on a cord.
      - Turkey's PM Warns Against Protests
  - source_sentence: A baby is laughing.
    sentences:
      - The baby laughed in his car seat.
      - A brown horse in a green field.
      - Bangladesh Islamist leader executed
pipeline_tag: sentence-similarity
model-index:
  - name: SentenceTransformer based on distilbert/distilbert-base-uncased
    results:
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 768
          type: sts-dev-768
        metrics:
          - type: pearson_cosine
            value: 0.8597256789475689
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8704890959686488
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8577087236028236
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8613364457717408
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8573646665610765
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8611053939518858
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.7230928823966007
            name: Pearson Dot
          - type: spearman_dot
            value: 0.7292814320710974
            name: Spearman Dot
          - type: pearson_max
            value: 0.8597256789475689
            name: Pearson Max
          - type: spearman_max
            value: 0.8704890959686488
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 512
          type: sts-dev-512
        metrics:
          - type: pearson_cosine
            value: 0.8565849984058084
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8690380994355429
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8560989283234569
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8602048185493963
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8560319360006069
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8598344132114529
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.7250593470322173
            name: Pearson Dot
          - type: spearman_dot
            value: 0.7324935808414036
            name: Spearman Dot
          - type: pearson_max
            value: 0.8565849984058084
            name: Pearson Max
          - type: spearman_max
            value: 0.8690380994355429
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 256
          type: sts-dev-256
        metrics:
          - type: pearson_cosine
            value: 0.8508677416837496
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8655671620679589
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8516296649395021
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8576372447474295
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8512958746883122
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8567348597207523
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.691266333570308
            name: Pearson Dot
          - type: spearman_dot
            value: 0.6983564197469347
            name: Spearman Dot
          - type: pearson_max
            value: 0.8516296649395021
            name: Pearson Max
          - type: spearman_max
            value: 0.8655671620679589
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 128
          type: sts-dev-128
        metrics:
          - type: pearson_cosine
            value: 0.8416379040782492
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8625866345174488
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8410105415496507
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8496221523132089
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8431760561066126
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8505697779445824
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.677560950193549
            name: Pearson Dot
          - type: spearman_dot
            value: 0.6864851260895027
            name: Spearman Dot
          - type: pearson_max
            value: 0.8431760561066126
            name: Pearson Max
          - type: spearman_max
            value: 0.8625866345174488
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 64
          type: sts-dev-64
        metrics:
          - type: pearson_cosine
            value: 0.823170809036498
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8523184158399918
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8255414664543136
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8358413125165197
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8292011526410756
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8385242101250404
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.641639319620455
            name: Pearson Dot
          - type: spearman_dot
            value: 0.6564088055361835
            name: Spearman Dot
          - type: pearson_max
            value: 0.8292011526410756
            name: Pearson Max
          - type: spearman_max
            value: 0.8523184158399918
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 32
          type: sts-dev-32
        metrics:
          - type: pearson_cosine
            value: 0.7903418859430655
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8327625705936669
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8031537655331857
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8168069966906343
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8078549989079483
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8195679102426064
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.5951512690504269
            name: Pearson Dot
          - type: spearman_dot
            value: 0.5992430550243973
            name: Spearman Dot
          - type: pearson_max
            value: 0.8078549989079483
            name: Pearson Max
          - type: spearman_max
            value: 0.8327625705936669
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 768
          type: sts-test-768
        metrics:
          - type: pearson_cosine
            value: 0.8259116102299048
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8420103291660583
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8417036739734224
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.839403978426242
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8416944892693242
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8392814362849023
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.6531059298507882
            name: Pearson Dot
          - type: spearman_dot
            value: 0.6395643411764597
            name: Spearman Dot
          - type: pearson_max
            value: 0.8417036739734224
            name: Pearson Max
          - type: spearman_max
            value: 0.8420103291660583
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 512
          type: sts-test-512
        metrics:
          - type: pearson_cosine
            value: 0.8243325623482549
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8417788357334501
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8405895269265039
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8387513037939833
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8405749756794761
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8386191956000736
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.6577547074460394
            name: Pearson Dot
          - type: spearman_dot
            value: 0.6453398362527448
            name: Spearman Dot
          - type: pearson_max
            value: 0.8405895269265039
            name: Pearson Max
          - type: spearman_max
            value: 0.8417788357334501
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 256
          type: sts-test-256
        metrics:
          - type: pearson_cosine
            value: 0.8128490933340125
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8343525276981816
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8349925426973063
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8339373046648948
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8349685334828352
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8342389147888624
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.6010530472572276
            name: Pearson Dot
          - type: spearman_dot
            value: 0.5827176472260001
            name: Spearman Dot
          - type: pearson_max
            value: 0.8349925426973063
            name: Pearson Max
          - type: spearman_max
            value: 0.8343525276981816
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 128
          type: sts-test-128
        metrics:
          - type: pearson_cosine
            value: 0.8037074044935162
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8297484250803338
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8282523311738189
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8292579770469635
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.828555014804415
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8294547431431344
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.579341375708575
            name: Pearson Dot
          - type: spearman_dot
            value: 0.5659659830073487
            name: Spearman Dot
          - type: pearson_max
            value: 0.828555014804415
            name: Pearson Max
          - type: spearman_max
            value: 0.8297484250803338
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 64
          type: sts-test-64
        metrics:
          - type: pearson_cosine
            value: 0.7861572380387101
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8221344542757412
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8179044736790866
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8218843830925717
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8199399298670013
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8240682904452457
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.5115276911122266
            name: Pearson Dot
          - type: spearman_dot
            value: 0.5024074247877125
            name: Spearman Dot
          - type: pearson_max
            value: 0.8199399298670013
            name: Pearson Max
          - type: spearman_max
            value: 0.8240682904452457
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 32
          type: sts-test-32
        metrics:
          - type: pearson_cosine
            value: 0.7616404560065974
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8126281001961144
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.7995560120404742
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8084393007868024
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8024415842761214
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8115677983458126
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.4646775610104062
            name: Pearson Dot
          - type: spearman_dot
            value: 0.451018702626726
            name: Spearman Dot
          - type: pearson_max
            value: 0.8024415842761214
            name: Pearson Max
          - type: spearman_max
            value: 0.8126281001961144
            name: Spearman Max

SentenceTransformer based on distilbert/distilbert-base-uncased

This is a sentence-transformers model fine-tuned from distilbert/distilbert-base-uncased on the sentence-transformers/stsb dataset. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

Model Details

Model Description

Model Sources

Full Model Architecture

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

Usage

Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

pip install -U sentence-transformers

Then you can load this model and run inference.

from sentence_transformers import SentenceTransformer

# Load the fine-tuned model; the weights are downloaded from the 🤗 Hub
model = SentenceTransformer("mrm8488/distilbert-base-matryoshka-sts")
# Run inference on three example sentences (the first two both describe a
# laughing baby, the third is about something else)
sentences = [
    'A baby is laughing.',
    'The baby laughed in his car seat.',
    'A brown horse in a green field.',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# Expected shape: [3, 768] (one 768-dimensional vector per input sentence)

# Get the similarity scores for the embeddings: every sentence is compared
# with every sentence, itself included
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# Expected shape: [3, 3] (one score per pair of sentences)

Evaluation

Metrics

Semantic Similarity (sts-dev-768)

Metric Value
pearson_cosine 0.8597
spearman_cosine 0.8705
pearson_manhattan 0.8577
spearman_manhattan 0.8613
pearson_euclidean 0.8574
spearman_euclidean 0.8611
pearson_dot 0.7231
spearman_dot 0.7293
pearson_max 0.8597
spearman_max 0.8705

Semantic Similarity (sts-dev-512)

Metric Value
pearson_cosine 0.8566
spearman_cosine 0.869
pearson_manhattan 0.8561
spearman_manhattan 0.8602
pearson_euclidean 0.856
spearman_euclidean 0.8598
pearson_dot 0.7251
spearman_dot 0.7325
pearson_max 0.8566
spearman_max 0.869

Semantic Similarity (sts-dev-256)

Metric Value
pearson_cosine 0.8509
spearman_cosine 0.8656
pearson_manhattan 0.8516
spearman_manhattan 0.8576
pearson_euclidean 0.8513
spearman_euclidean 0.8567
pearson_dot 0.6913
spearman_dot 0.6984
pearson_max 0.8516
spearman_max 0.8656

Semantic Similarity (sts-dev-128)

Metric Value
pearson_cosine 0.8416
spearman_cosine 0.8626
pearson_manhattan 0.841
spearman_manhattan 0.8496
pearson_euclidean 0.8432
spearman_euclidean 0.8506
pearson_dot 0.6776
spearman_dot 0.6865
pearson_max 0.8432
spearman_max 0.8626

Semantic Similarity (sts-dev-64)

Metric Value
pearson_cosine 0.8232
spearman_cosine 0.8523
pearson_manhattan 0.8255
spearman_manhattan 0.8358
pearson_euclidean 0.8292
spearman_euclidean 0.8385
pearson_dot 0.6416
spearman_dot 0.6564
pearson_max 0.8292
spearman_max 0.8523

Semantic Similarity (sts-dev-32)

Metric Value
pearson_cosine 0.7903
spearman_cosine 0.8328
pearson_manhattan 0.8032
spearman_manhattan 0.8168
pearson_euclidean 0.8079
spearman_euclidean 0.8196
pearson_dot 0.5952
spearman_dot 0.5992
pearson_max 0.8079
spearman_max 0.8328

Semantic Similarity (sts-test-768)

Metric Value
pearson_cosine 0.8259
spearman_cosine 0.842
pearson_manhattan 0.8417
spearman_manhattan 0.8394
pearson_euclidean 0.8417
spearman_euclidean 0.8393
pearson_dot 0.6531
spearman_dot 0.6396
pearson_max 0.8417
spearman_max 0.842

Semantic Similarity (sts-test-512)

Metric Value
pearson_cosine 0.8243
spearman_cosine 0.8418
pearson_manhattan 0.8406
spearman_manhattan 0.8388
pearson_euclidean 0.8406
spearman_euclidean 0.8386
pearson_dot 0.6578
spearman_dot 0.6453
pearson_max 0.8406
spearman_max 0.8418

Semantic Similarity (sts-test-256)

Metric Value
pearson_cosine 0.8128
spearman_cosine 0.8344
pearson_manhattan 0.835
spearman_manhattan 0.8339
pearson_euclidean 0.835
spearman_euclidean 0.8342
pearson_dot 0.6011
spearman_dot 0.5827
pearson_max 0.835
spearman_max 0.8344

Semantic Similarity (sts-test-128)

Metric Value
pearson_cosine 0.8037
spearman_cosine 0.8297
pearson_manhattan 0.8283
spearman_manhattan 0.8293
pearson_euclidean 0.8286
spearman_euclidean 0.8295
pearson_dot 0.5793
spearman_dot 0.566
pearson_max 0.8286
spearman_max 0.8297

Semantic Similarity (sts-test-64)

Metric Value
pearson_cosine 0.7862
spearman_cosine 0.8221
pearson_manhattan 0.8179
spearman_manhattan 0.8219
pearson_euclidean 0.8199
spearman_euclidean 0.8241
pearson_dot 0.5115
spearman_dot 0.5024
pearson_max 0.8199
spearman_max 0.8241

Semantic Similarity (sts-test-32)

Metric Value
pearson_cosine 0.7616
spearman_cosine 0.8126
pearson_manhattan 0.7996
spearman_manhattan 0.8084
pearson_euclidean 0.8024
spearman_euclidean 0.8116
pearson_dot 0.4647
spearman_dot 0.451
pearson_max 0.8024
spearman_max 0.8126

Training Details

Training Dataset

sentence-transformers/stsb

  • Dataset: sentence-transformers/stsb at ab7a5ac
  • Size: 5,749 training samples
  • Columns: sentence1, sentence2, and score
  • Approximate statistics based on the first 1000 samples:
    |         | sentence1 | sentence2 | score |
    |---------|-----------|-----------|-------|
    | type    | string    | string    | float |
    | details | min: 6 tokens, mean: 10.0 tokens, max: 28 tokens | min: 5 tokens, mean: 9.95 tokens, max: 25 tokens | min: 0.0, mean: 0.54, max: 1.0 |
  • Samples:
    | sentence1 | sentence2 | score |
    |-----------|-----------|-------|
    | A plane is taking off. | An air plane is taking off. | 1.0 |
    | A man is playing a large flute. | A man is playing a flute. | 0.76 |
    | A man is spreading shreded cheese on a pizza. | A man is spreading shredded cheese on an uncooked pizza. | 0.76 |
  • Loss: MatryoshkaLoss with these parameters:
    {
        "loss": "CoSENTLoss",
        "matryoshka_dims": [
            768,
            512,
            256,
            128,
            64,
            32
        ],
        "matryoshka_weights": [
            1,
            1,
            1,
            1,
            1,
            1
        ],
        "n_dims_per_step": -1
    }
    

Evaluation Dataset

sentence-transformers/stsb

  • Dataset: sentence-transformers/stsb at ab7a5ac
  • Size: 1,500 evaluation samples
  • Columns: sentence1, sentence2, and score
  • Approximate statistics based on the first 1000 samples:
    |         | sentence1 | sentence2 | score |
    |---------|-----------|-----------|-------|
    | type    | string    | string    | float |
    | details | min: 5 tokens, mean: 15.1 tokens, max: 45 tokens | min: 6 tokens, mean: 15.11 tokens, max: 53 tokens | min: 0.0, mean: 0.47, max: 1.0 |
  • Samples:
    | sentence1 | sentence2 | score |
    |-----------|-----------|-------|
    | A man with a hard hat is dancing. | A man wearing a hard hat is dancing. | 1.0 |
    | A young child is riding a horse. | A child is riding a horse. | 0.95 |
    | A man is feeding a mouse to a snake. | The man is feeding a mouse to the snake. | 1.0 |
  • Loss: MatryoshkaLoss with these parameters:
    {
        "loss": "CoSENTLoss",
        "matryoshka_dims": [
            768,
            512,
            256,
            128,
            64,
            32
        ],
        "matryoshka_weights": [
            1,
            1,
            1,
            1,
            1,
            1
        ],
        "n_dims_per_step": -1
    }
    

Training Hyperparameters

Non-Default Hyperparameters

  • eval_strategy: steps
  • per_device_train_batch_size: 16
  • per_device_eval_batch_size: 16
  • num_train_epochs: 4
  • warmup_ratio: 0.1
  • fp16: True

All Hyperparameters

Click to expand
  • overwrite_output_dir: False
  • do_predict: False
  • eval_strategy: steps
  • prediction_loss_only: True
  • per_device_train_batch_size: 16
  • per_device_eval_batch_size: 16
  • per_gpu_train_batch_size: None
  • per_gpu_eval_batch_size: None
  • gradient_accumulation_steps: 1
  • eval_accumulation_steps: None
  • learning_rate: 5e-05
  • weight_decay: 0.0
  • adam_beta1: 0.9
  • adam_beta2: 0.999
  • adam_epsilon: 1e-08
  • max_grad_norm: 1.0
  • num_train_epochs: 4
  • max_steps: -1
  • lr_scheduler_type: linear
  • lr_scheduler_kwargs: {}
  • warmup_ratio: 0.1
  • warmup_steps: 0
  • log_level: passive
  • log_level_replica: warning
  • log_on_each_node: True
  • logging_nan_inf_filter: True
  • save_safetensors: True
  • save_on_each_node: False
  • save_only_model: False
  • restore_callback_states_from_checkpoint: False
  • no_cuda: False
  • use_cpu: False
  • use_mps_device: False
  • seed: 42
  • data_seed: None
  • jit_mode_eval: False
  • use_ipex: False
  • bf16: False
  • fp16: True
  • fp16_opt_level: O1
  • half_precision_backend: auto
  • bf16_full_eval: False
  • fp16_full_eval: False
  • tf32: None
  • local_rank: 0
  • ddp_backend: None
  • tpu_num_cores: None
  • tpu_metrics_debug: False
  • debug: []
  • dataloader_drop_last: False
  • dataloader_num_workers: 0
  • dataloader_prefetch_factor: None
  • past_index: -1
  • disable_tqdm: False
  • remove_unused_columns: True
  • label_names: None
  • load_best_model_at_end: False
  • ignore_data_skip: False
  • fsdp: []
  • fsdp_min_num_params: 0
  • fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
  • fsdp_transformer_layer_cls_to_wrap: None
  • accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
  • deepspeed: None
  • label_smoothing_factor: 0.0
  • optim: adamw_torch
  • optim_args: None
  • adafactor: False
  • group_by_length: False
  • length_column_name: length
  • ddp_find_unused_parameters: None
  • ddp_bucket_cap_mb: None
  • ddp_broadcast_buffers: False
  • dataloader_pin_memory: True
  • dataloader_persistent_workers: False
  • skip_memory_metrics: True
  • use_legacy_prediction_loop: False
  • push_to_hub: False
  • resume_from_checkpoint: None
  • hub_model_id: None
  • hub_strategy: every_save
  • hub_private_repo: False
  • hub_always_push: False
  • gradient_checkpointing: False
  • gradient_checkpointing_kwargs: None
  • include_inputs_for_metrics: False
  • eval_do_concat_batches: True
  • fp16_backend: auto
  • push_to_hub_model_id: None
  • push_to_hub_organization: None
  • mp_parameters:
  • auto_find_batch_size: False
  • full_determinism: False
  • torchdynamo: None
  • ray_scope: last
  • ddp_timeout: 1800
  • torch_compile: False
  • torch_compile_backend: None
  • torch_compile_mode: None
  • dispatch_batches: None
  • split_batches: None
  • include_tokens_per_second: False
  • include_num_input_tokens_seen: False
  • neftune_noise_alpha: None
  • optim_target_modules: None
  • batch_eval_metrics: False
  • batch_sampler: batch_sampler
  • multi_dataset_batch_sampler: proportional

Training Logs

Epoch Step Training Loss loss sts-dev-128_spearman_cosine sts-dev-256_spearman_cosine sts-dev-32_spearman_cosine sts-dev-512_spearman_cosine sts-dev-64_spearman_cosine sts-dev-768_spearman_cosine sts-test-128_spearman_cosine sts-test-256_spearman_cosine sts-test-32_spearman_cosine sts-test-512_spearman_cosine sts-test-64_spearman_cosine sts-test-768_spearman_cosine
0.2778 100 28.2763 26.3514 0.8250 0.8306 0.7893 0.8308 0.8094 0.8314 - - - - - -
0.5556 200 26.3731 26.0000 0.8373 0.8412 0.8026 0.8463 0.8267 0.8467 - - - - - -
0.8333 300 26.0243 26.5062 0.8434 0.8495 0.8073 0.8534 0.8297 0.8556 - - - - - -
1.1111 400 25.3448 28.1742 0.8496 0.8544 0.8157 0.8593 0.8361 0.8611 - - - - - -
1.3889 500 24.7922 27.0245 0.8488 0.8529 0.8149 0.8574 0.8352 0.8589 - - - - - -
1.6667 600 24.7596 26.9771 0.8516 0.8558 0.8199 0.8601 0.8389 0.8619 - - - - - -
1.9444 700 24.7165 26.2923 0.8602 0.8634 0.8277 0.8665 0.8476 0.8681 - - - - - -
2.2222 800 23.7934 27.9207 0.8570 0.8608 0.8263 0.8640 0.8460 0.8656 - - - - - -
2.5 900 23.4618 27.5855 0.8583 0.8618 0.8257 0.8657 0.8456 0.8675 - - - - - -
2.7778 1000 23.1831 29.9791 0.8533 0.8557 0.8232 0.8599 0.8411 0.8612 - - - - - -
3.0556 1100 23.1935 28.7866 0.8612 0.8636 0.8329 0.8677 0.8504 0.8689 - - - - - -
3.3333 1200 22.1447 30.0641 0.8597 0.8630 0.8285 0.8661 0.8488 0.8676 - - - - - -
3.6111 1300 21.9271 30.9347 0.8613 0.8648 0.8309 0.8679 0.8509 0.8697 - - - - - -
3.8889 1400 21.973 30.9209 0.8626 0.8656 0.8328 0.8690 0.8523 0.8705 - - - - - -
4.0 1440 - - - - - - - - 0.8297 0.8344 0.8126 0.8418 0.8221 0.8420

Framework Versions

  • Python: 3.10.12
  • Sentence Transformers: 3.0.0
  • Transformers: 4.41.1
  • PyTorch: 2.3.0+cu121
  • Accelerate: 0.30.1
  • Datasets: 2.19.1
  • Tokenizers: 0.19.1

Citation

BibTeX

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

MatryoshkaLoss

@misc{kusupati2024matryoshka,
    title={Matryoshka Representation Learning}, 
    author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
    year={2024},
    eprint={2205.13147},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

CoSENTLoss

@online{kexuefm-8847,
    title={CoSENT: A more efficient sentence vector scheme than Sentence-BERT},
    author={Su Jianlin},
    year={2022},
    month={Jan},
    url={https://kexue.fm/archives/8847},
}