tomaarsen's picture
tomaarsen HF staff
Update README.md
461ec72 verified
metadata
language:
  - en
library_name: sentence-transformers
tags:
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - loss:MatryoshkaLoss
  - loss:CoSENTLoss
base_model: distilbert/distilbert-base-uncased
metrics:
  - pearson_cosine
  - spearman_cosine
  - pearson_manhattan
  - spearman_manhattan
  - pearson_euclidean
  - spearman_euclidean
  - pearson_dot
  - spearman_dot
  - pearson_max
  - spearman_max
widget:
  - source_sentence: The gate is yellow.
    sentences:
      - The gate is blue.
      - The person is starting a fire.
      - A woman is bungee jumping.
  - source_sentence: A plane in the sky.
    sentences:
      - Two airplanes in the sky.
      - A man is standing in the rain.
      - There are two men near a wall.
  - source_sentence: A woman is reading.
    sentences:
      - A woman is writing something.
      - A woman is applying eye shadow.
      - A dog and a red ball in the air.
  - source_sentence: A baby is laughing.
    sentences:
      - The baby laughed in his car seat.
      - Suicide bomber strikes in Syria
      - Bangladesh Islamist execution upheld
  - source_sentence: A woman is dancing.
    sentences:
      - A woman is dancing in railway station.
      - The flag was moving in the air.
      - three dogs growling On one another
pipeline_tag: sentence-similarity
# Carbon footprint of the training run, as measured by CodeCarbon (see `source`).
co2_eq_emissions:
  # CO2-equivalent emitted, in grams (the card body reports this as ~0.008 kg).
  emissions: 7.871164130493101
  # Energy consumed during training, in kWh.
  energy_consumed: 0.020249867843471606
  source: codecarbon
  training_type: fine-tuning
  # Training ran on local hardware, not on a cloud provider.
  on_cloud: false
  cpu_model: 13th Gen Intel(R) Core(TM) i7-13700K
  # Total system RAM, in GB.
  ram_total_size: 31.777088165283203
  # Duration of the training run, in hours (roughly 6.7 minutes).
  hours_used: 0.112
  hardware_used: 1 x NVIDIA GeForce RTX 3090
model-index:
  - name: SentenceTransformer based on distilbert/distilbert-base-uncased
    results:
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 768
          type: sts-dev-768
        metrics:
          - type: pearson_cosine
            value: 0.8647737221000229
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8747521728687471
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8627734228763478
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8657556253211545
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.862712112144467
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8657615257280495
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.7442745641899206
            name: Pearson Dot
          - type: spearman_dot
            value: 0.7513830366520415
            name: Spearman Dot
          - type: pearson_max
            value: 0.8647737221000229
            name: Pearson Max
          - type: spearman_max
            value: 0.8747521728687471
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 512
          type: sts-dev-512
        metrics:
          - type: pearson_cosine
            value: 0.8628378541768764
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8741345340758229
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8619744745534216
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8651450292937584
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8622841683977804
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8653280682431165
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.746359236761633
            name: Pearson Dot
          - type: spearman_dot
            value: 0.7540849763868891
            name: Spearman Dot
          - type: pearson_max
            value: 0.8628378541768764
            name: Pearson Max
          - type: spearman_max
            value: 0.8741345340758229
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 256
          type: sts-dev-256
        metrics:
          - type: pearson_cosine
            value: 0.8588975886507025
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8714341050301952
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8590790006287132
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8634123185807864
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8591861535833625
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8628587088112977
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.7185871795192371
            name: Pearson Dot
          - type: spearman_dot
            value: 0.7288595287151053
            name: Spearman Dot
          - type: pearson_max
            value: 0.8591861535833625
            name: Pearson Max
          - type: spearman_max
            value: 0.8714341050301952
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 128
          type: sts-dev-128
        metrics:
          - type: pearson_cosine
            value: 0.8528583626543365
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8687502864484896
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8509433708242649
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.857615159782176
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8531616082767298
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8580823134153918
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.697019210549756
            name: Pearson Dot
          - type: spearman_dot
            value: 0.705924438927243
            name: Spearman Dot
          - type: pearson_max
            value: 0.8531616082767298
            name: Pearson Max
          - type: spearman_max
            value: 0.8687502864484896
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 64
          type: sts-dev-64
        metrics:
          - type: pearson_cosine
            value: 0.8340115410608493
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.858682843519445
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8351566362279711
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8445869885309296
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.838674217877368
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8460894143343873
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.6579249229659768
            name: Pearson Dot
          - type: spearman_dot
            value: 0.6712615573330701
            name: Spearman Dot
          - type: pearson_max
            value: 0.838674217877368
            name: Pearson Max
          - type: spearman_max
            value: 0.858682843519445
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 768
          type: sts-test-768
        metrics:
          - type: pearson_cosine
            value: 0.833720870548252
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8469501140979906
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8484755252691695
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8470024066861298
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8492651445573072
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8475238481800537
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.6701649984837568
            name: Pearson Dot
          - type: spearman_dot
            value: 0.6526285131648061
            name: Spearman Dot
          - type: pearson_max
            value: 0.8492651445573072
            name: Pearson Max
          - type: spearman_max
            value: 0.8475238481800537
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 512
          type: sts-test-512
        metrics:
          - type: pearson_cosine
            value: 0.8325595554355977
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8467500241650668
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8474378528408064
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8462571021525837
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.848182316243596
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8466275072216626
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.6736686039338646
            name: Pearson Dot
          - type: spearman_dot
            value: 0.6572299516736647
            name: Spearman Dot
          - type: pearson_max
            value: 0.848182316243596
            name: Pearson Max
          - type: spearman_max
            value: 0.8467500241650668
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 256
          type: sts-test-256
        metrics:
          - type: pearson_cosine
            value: 0.8225923032714455
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8403145699624681
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8420998942805191
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8419520394692916
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8434867831513
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8428522494561291
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.6230179114374444
            name: Pearson Dot
          - type: spearman_dot
            value: 0.6061595939729718
            name: Spearman Dot
          - type: pearson_max
            value: 0.8434867831513
            name: Pearson Max
          - type: spearman_max
            value: 0.8428522494561291
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 128
          type: sts-test-128
        metrics:
          - type: pearson_cosine
            value: 0.8149976807930366
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8349547446101432
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8351661617446753
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8360899024374612
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8375785243041524
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8375574347771609
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.5958381414366161
            name: Pearson Dot
          - type: spearman_dot
            value: 0.5793444545861678
            name: Spearman Dot
          - type: pearson_max
            value: 0.8375785243041524
            name: Pearson Max
          - type: spearman_max
            value: 0.8375574347771609
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 64
          type: sts-test-64
        metrics:
          - type: pearson_cosine
            value: 0.7981336004264228
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8269913105115189
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8238799955007295
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8289121477853545
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8278657744625194
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8314643517951371
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.5206433480609991
            name: Pearson Dot
          - type: spearman_dot
            value: 0.5067194535547845
            name: Spearman Dot
          - type: pearson_max
            value: 0.8278657744625194
            name: Pearson Max
          - type: spearman_max
            value: 0.8314643517951371
            name: Spearman Max

SentenceTransformer based on distilbert/distilbert-base-uncased

This is a sentence-transformers model fine-tuned from distilbert/distilbert-base-uncased on the sentence-transformers/stsb dataset. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

Model Details

Model Description

Model Sources

Full Model Architecture

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

Usage

Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

pip install -U sentence-transformers

Then you can load this model and run inference.

from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("tomaarsen/distilbert-base-uncased-sts-matryoshka")
# Run inference
sentences = [
    'A woman is dancing.',
    'A woman is dancing in railway station.',
    'The flag was moving in the air.',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 768]

# Get the similarity scores for the embeddings.
# `similarity` compares two sets of embeddings, so pass the same set twice
# to obtain the full pairwise matrix between all input sentences.
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]

Evaluation

Metrics

Semantic Similarity (dataset: sts-dev-768)

Metric Value
pearson_cosine 0.8648
spearman_cosine 0.8748
pearson_manhattan 0.8628
spearman_manhattan 0.8658
pearson_euclidean 0.8627
spearman_euclidean 0.8658
pearson_dot 0.7443
spearman_dot 0.7514
pearson_max 0.8648
spearman_max 0.8748

Semantic Similarity (dataset: sts-dev-512)

Metric Value
pearson_cosine 0.8628
spearman_cosine 0.8741
pearson_manhattan 0.862
spearman_manhattan 0.8651
pearson_euclidean 0.8623
spearman_euclidean 0.8653
pearson_dot 0.7464
spearman_dot 0.7541
pearson_max 0.8628
spearman_max 0.8741

Semantic Similarity (dataset: sts-dev-256)

Metric Value
pearson_cosine 0.8589
spearman_cosine 0.8714
pearson_manhattan 0.8591
spearman_manhattan 0.8634
pearson_euclidean 0.8592
spearman_euclidean 0.8629
pearson_dot 0.7186
spearman_dot 0.7289
pearson_max 0.8592
spearman_max 0.8714

Semantic Similarity (dataset: sts-dev-128)

Metric Value
pearson_cosine 0.8529
spearman_cosine 0.8688
pearson_manhattan 0.8509
spearman_manhattan 0.8576
pearson_euclidean 0.8532
spearman_euclidean 0.8581
pearson_dot 0.697
spearman_dot 0.7059
pearson_max 0.8532
spearman_max 0.8688

Semantic Similarity (dataset: sts-dev-64)

Metric Value
pearson_cosine 0.834
spearman_cosine 0.8587
pearson_manhattan 0.8352
spearman_manhattan 0.8446
pearson_euclidean 0.8387
spearman_euclidean 0.8461
pearson_dot 0.6579
spearman_dot 0.6713
pearson_max 0.8387
spearman_max 0.8587

Semantic Similarity (dataset: sts-test-768)

Metric Value
pearson_cosine 0.8337
spearman_cosine 0.847
pearson_manhattan 0.8485
spearman_manhattan 0.847
pearson_euclidean 0.8493
spearman_euclidean 0.8475
pearson_dot 0.6702
spearman_dot 0.6526
pearson_max 0.8493
spearman_max 0.8475

Semantic Similarity (dataset: sts-test-512)

Metric Value
pearson_cosine 0.8326
spearman_cosine 0.8468
pearson_manhattan 0.8474
spearman_manhattan 0.8463
pearson_euclidean 0.8482
spearman_euclidean 0.8466
pearson_dot 0.6737
spearman_dot 0.6572
pearson_max 0.8482
spearman_max 0.8468

Semantic Similarity (dataset: sts-test-256)

Metric Value
pearson_cosine 0.8226
spearman_cosine 0.8403
pearson_manhattan 0.8421
spearman_manhattan 0.842
pearson_euclidean 0.8435
spearman_euclidean 0.8429
pearson_dot 0.623
spearman_dot 0.6062
pearson_max 0.8435
spearman_max 0.8429

Semantic Similarity (dataset: sts-test-128)

Metric Value
pearson_cosine 0.815
spearman_cosine 0.835
pearson_manhattan 0.8352
spearman_manhattan 0.8361
pearson_euclidean 0.8376
spearman_euclidean 0.8376
pearson_dot 0.5958
spearman_dot 0.5793
pearson_max 0.8376
spearman_max 0.8376

Semantic Similarity (dataset: sts-test-64)

Metric Value
pearson_cosine 0.7981
spearman_cosine 0.827
pearson_manhattan 0.8239
spearman_manhattan 0.8289
pearson_euclidean 0.8279
spearman_euclidean 0.8315
pearson_dot 0.5206
spearman_dot 0.5067
pearson_max 0.8279
spearman_max 0.8315

Training Details

Training Dataset

sentence-transformers/stsb

  • Dataset: sentence-transformers/stsb at ab7a5ac
  • Size: 5,749 training samples
  • Columns: sentence1, sentence2, and score
  • Approximate statistics based on the first 1000 samples:
    sentence1 sentence2 score
    type string string float
    details
    • min: 6 tokens
    • mean: 10.0 tokens
    • max: 28 tokens
    • min: 5 tokens
    • mean: 9.95 tokens
    • max: 25 tokens
    • min: 0.0
    • mean: 0.54
    • max: 1.0
  • Samples:
    sentence1 sentence2 score
    A plane is taking off. An air plane is taking off. 1.0
    A man is playing a large flute. A man is playing a flute. 0.76
    A man is spreading shreded cheese on a pizza. A man is spreading shredded cheese on an uncooked pizza. 0.76
  • Loss: MatryoshkaLoss with these parameters:
    {
        "loss": "CoSENTLoss",
        "matryoshka_dims": [
            768,
            512,
            256,
            128,
            64
        ],
        "matryoshka_weights": [
            1,
            1,
            1,
            1,
            1
        ],
        "n_dims_per_step": -1
    }
    

Evaluation Dataset

sentence-transformers/stsb

  • Dataset: sentence-transformers/stsb at ab7a5ac
  • Size: 1,500 evaluation samples
  • Columns: sentence1, sentence2, and score
  • Approximate statistics based on the first 1000 samples:
    sentence1 sentence2 score
    type string string float
    details
    • min: 5 tokens
    • mean: 15.1 tokens
    • max: 45 tokens
    • min: 6 tokens
    • mean: 15.11 tokens
    • max: 53 tokens
    • min: 0.0
    • mean: 0.47
    • max: 1.0
  • Samples:
    sentence1 sentence2 score
    A man with a hard hat is dancing. A man wearing a hard hat is dancing. 1.0
    A young child is riding a horse. A child is riding a horse. 0.95
    A man is feeding a mouse to a snake. The man is feeding a mouse to the snake. 1.0
  • Loss: MatryoshkaLoss with these parameters:
    {
        "loss": "CoSENTLoss",
        "matryoshka_dims": [
            768,
            512,
            256,
            128,
            64
        ],
        "matryoshka_weights": [
            1,
            1,
            1,
            1,
            1
        ],
        "n_dims_per_step": -1
    }
    

Training Hyperparameters

Non-Default Hyperparameters

  • eval_strategy: steps
  • per_device_train_batch_size: 16
  • per_device_eval_batch_size: 16
  • num_train_epochs: 4
  • warmup_ratio: 0.1
  • fp16: True

All Hyperparameters

Click to expand
  • overwrite_output_dir: False
  • do_predict: False
  • eval_strategy: steps
  • prediction_loss_only: False
  • per_device_train_batch_size: 16
  • per_device_eval_batch_size: 16
  • per_gpu_train_batch_size: None
  • per_gpu_eval_batch_size: None
  • gradient_accumulation_steps: 1
  • eval_accumulation_steps: None
  • learning_rate: 5e-05
  • weight_decay: 0.0
  • adam_beta1: 0.9
  • adam_beta2: 0.999
  • adam_epsilon: 1e-08
  • max_grad_norm: 1.0
  • num_train_epochs: 4
  • max_steps: -1
  • lr_scheduler_type: linear
  • lr_scheduler_kwargs: {}
  • warmup_ratio: 0.1
  • warmup_steps: 0
  • log_level: passive
  • log_level_replica: warning
  • log_on_each_node: True
  • logging_nan_inf_filter: True
  • save_safetensors: True
  • save_on_each_node: False
  • save_only_model: False
  • no_cuda: False
  • use_cpu: False
  • use_mps_device: False
  • seed: 42
  • data_seed: None
  • jit_mode_eval: False
  • use_ipex: False
  • bf16: False
  • fp16: True
  • fp16_opt_level: O1
  • half_precision_backend: auto
  • bf16_full_eval: False
  • fp16_full_eval: False
  • tf32: None
  • local_rank: 0
  • ddp_backend: None
  • tpu_num_cores: None
  • tpu_metrics_debug: False
  • debug: []
  • dataloader_drop_last: False
  • dataloader_num_workers: 0
  • dataloader_prefetch_factor: None
  • past_index: -1
  • disable_tqdm: False
  • remove_unused_columns: True
  • label_names: None
  • load_best_model_at_end: False
  • ignore_data_skip: False
  • fsdp: []
  • fsdp_min_num_params: 0
  • fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
  • fsdp_transformer_layer_cls_to_wrap: None
  • accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
  • deepspeed: None
  • label_smoothing_factor: 0.0
  • optim: adamw_torch
  • optim_args: None
  • adafactor: False
  • group_by_length: False
  • length_column_name: length
  • ddp_find_unused_parameters: None
  • ddp_bucket_cap_mb: None
  • ddp_broadcast_buffers: None
  • dataloader_pin_memory: True
  • dataloader_persistent_workers: False
  • skip_memory_metrics: True
  • use_legacy_prediction_loop: False
  • push_to_hub: False
  • resume_from_checkpoint: None
  • hub_model_id: None
  • hub_strategy: every_save
  • hub_private_repo: False
  • hub_always_push: False
  • gradient_checkpointing: False
  • gradient_checkpointing_kwargs: None
  • include_inputs_for_metrics: False
  • eval_do_concat_batches: True
  • fp16_backend: auto
  • push_to_hub_model_id: None
  • push_to_hub_organization: None
  • mp_parameters:
  • auto_find_batch_size: False
  • full_determinism: False
  • torchdynamo: None
  • ray_scope: last
  • ddp_timeout: 1800
  • torch_compile: False
  • torch_compile_backend: None
  • torch_compile_mode: None
  • dispatch_batches: None
  • split_batches: None
  • include_tokens_per_second: False
  • include_num_input_tokens_seen: False
  • neftune_noise_alpha: None
  • optim_target_modules: None
  • batch_sampler: batch_sampler
  • multi_dataset_batch_sampler: proportional

Training Logs

Epoch Step Training Loss Validation Loss sts-dev-128_spearman_cosine sts-dev-256_spearman_cosine sts-dev-512_spearman_cosine sts-dev-64_spearman_cosine sts-dev-768_spearman_cosine sts-test-128_spearman_cosine sts-test-256_spearman_cosine sts-test-512_spearman_cosine sts-test-64_spearman_cosine sts-test-768_spearman_cosine
0.2778 100 23.266 21.5517 0.8305 0.8355 0.8361 0.8157 0.8366 - - - - -
0.5556 200 21.8736 21.6172 0.8327 0.8388 0.8446 0.8206 0.8453 - - - - -
0.8333 300 21.6241 22.0565 0.8475 0.8538 0.8556 0.8345 0.8565 - - - - -
1.1111 400 21.075 23.6719 0.8545 0.8581 0.8634 0.8435 0.8644 - - - - -
1.3889 500 20.4122 22.5926 0.8592 0.8624 0.8650 0.8436 0.8656 - - - - -
1.6667 600 20.6586 22.5999 0.8514 0.8563 0.8595 0.8389 0.8597 - - - - -
1.9444 700 20.3262 22.2965 0.8582 0.8631 0.8666 0.8465 0.8667 - - - - -
2.2222 800 19.7948 23.1844 0.8621 0.8659 0.8688 0.8499 0.8694 - - - - -
2.5 900 19.2826 23.1351 0.8653 0.8687 0.8703 0.8547 0.8710 - - - - -
2.7778 1000 19.1063 23.7141 0.8641 0.8672 0.8691 0.8531 0.8695 - - - - -
3.0556 1100 19.4575 23.0055 0.8673 0.8702 0.8726 0.8574 0.8728 - - - - -
3.3333 1200 18.0727 24.9288 0.8659 0.8692 0.8715 0.8565 0.8722 - - - - -
3.6111 1300 18.1698 25.3114 0.8675 0.8701 0.8728 0.8576 0.8734 - - - - -
3.8889 1400 18.2321 25.3777 0.8688 0.8714 0.8741 0.8587 0.8748 - - - - -
4.0 1440 - - - - - - - 0.8350 0.8403 0.8468 0.8270 0.8470

Environmental Impact

Carbon emissions were measured using CodeCarbon.

  • Energy Consumed: 0.020 kWh
  • Carbon Emitted: 0.008 kg of CO2
  • Hours Used: 0.112 hours

Training Hardware

  • On Cloud: No
  • GPU Model: 1 x NVIDIA GeForce RTX 3090
  • CPU Model: 13th Gen Intel(R) Core(TM) i7-13700K
  • RAM Size: 31.78 GB

Framework Versions

  • Python: 3.11.6
  • Sentence Transformers: 3.0.0.dev0
  • Transformers: 4.41.0.dev0
  • PyTorch: 2.3.0+cu121
  • Accelerate: 0.26.1
  • Datasets: 2.18.0
  • Tokenizers: 0.19.1

Citation

BibTeX

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

MatryoshkaLoss

@misc{kusupati2024matryoshka,
    title={Matryoshka Representation Learning}, 
    author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
    year={2024},
    eprint={2205.13147},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

CoSENTLoss

@online{kexuefm-8847,
    title={CoSENT: A more efficient sentence vector scheme than Sentence-BERT},
    author={Su Jianlin},
    year={2022},
    month={Jan},
    url={https://kexue.fm/archives/8847},
}