---
language: []
library_name: sentence-transformers
tags:
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - generated_from_trainer
  - dataset_size:1115700
  - loss:MatryoshkaLoss
  - loss:MultipleNegativesRankingLoss
base_model: Geotrend/bert-base-sw-cased
datasets: []
metrics:
  - pearson_cosine
  - spearman_cosine
  - pearson_manhattan
  - spearman_manhattan
  - pearson_euclidean
  - spearman_euclidean
  - pearson_dot
  - spearman_dot
  - pearson_max
  - spearman_max
widget:
  - source_sentence: Ndege mwenye mdomo mrefu katikati ya ndege.
    sentences:
      - Panya anayekimbia juu ya gurudumu.
      - Mtu anashindana katika mashindano ya mbio.
      - Ndege anayeruka.
  - source_sentence: >-
      Msichana mchanga mwenye nywele nyeusi anakabili kamera na kushikilia mfuko
      wa karatasi wakati amevaa shati la machungwa na mabawa ya kipepeo yenye
      rangi nyingi.
    sentences:
      - Mwanamke mzee anakataa kupigwa picha.
      - mtu akila na mvulana mdogo kwenye kijia cha jiji
      - Msichana mchanga anakabili kamera.
  - source_sentence: >-
      Wanawake na watoto wameketi nje katika kivuli wakati kikundi cha watoto
      wadogo wameketi ndani katika kivuli.
    sentences:
      - Mwanamke na watoto na kukaa chini.
      - Mwanamke huyo anakimbia.
      - Watu wanasafiri kwa baiskeli.
  - source_sentence: >-
      Mtoto mdogo anaruka mikononi mwa mwanamke aliyevalia suti nyeusi ya
      kuogelea akiwa kwenye dimbwi.
    sentences:
      - >-
        Mtoto akiruka mikononi mwa mwanamke aliyevalia suti ya kuogelea kwenye
        dimbwi.
      - Someone is holding oranges and walking
      - Mama na binti wakinunua viatu.
  - source_sentence: >-
      Mwanamume na mwanamke wachanga waliovaa mikoba wanaweka au kuondoa kitu
      kutoka kwenye mti mweupe wa zamani, huku watu wengine wamesimama au
      wameketi nyuma.
    sentences:
      - tai huruka
      - mwanamume na mwanamke wenye mikoba
      - Wanaume wawili wameketi karibu na mwanamke.
pipeline_tag: sentence-similarity
model-index:
  - name: SentenceTransformer based on Geotrend/bert-base-sw-cased
    results:
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 768
          type: sts-test-768
        metrics:
          - type: pearson_cosine
            value: 0.6937245827269046
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.6872564222432196
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.6671541268726737
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.6578428252987948
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.6672292642346008
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.6577692881532263
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.5234944445417878
            name: Pearson Dot
          - type: spearman_dot
            value: 0.5126395384896926
            name: Spearman Dot
          - type: pearson_max
            value: 0.6937245827269046
            name: Pearson Max
          - type: spearman_max
            value: 0.6872564222432196
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 512
          type: sts-test-512
        metrics:
          - type: pearson_cosine
            value: 0.689885399601221
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.6847071916895495
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.6678379220949281
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.6579957115799916
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.6673062843667007
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.6573006123381013
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.49533316366864977
            name: Pearson Dot
          - type: spearman_dot
            value: 0.48723679408818543
            name: Spearman Dot
          - type: pearson_max
            value: 0.689885399601221
            name: Pearson Max
          - type: spearman_max
            value: 0.6847071916895495
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 256
          type: sts-test-256
        metrics:
          - type: pearson_cosine
            value: 0.6873377612773459
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.6816874105466478
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.667357515297651
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.6557727891191705
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.6674937201647584
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.6560441259953166
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.45660372834373963
            name: Pearson Dot
          - type: spearman_dot
            value: 0.4533070407260065
            name: Spearman Dot
          - type: pearson_max
            value: 0.6873377612773459
            name: Pearson Max
          - type: spearman_max
            value: 0.6816874105466478
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 128
          type: sts-test-128
        metrics:
          - type: pearson_cosine
            value: 0.6836009506667413
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.6795423695973911
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.6663652896396122
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.6534731725514219
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.6663726876345561
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.6537216014002204
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.43102957451470686
            name: Pearson Dot
          - type: spearman_dot
            value: 0.431538008932168
            name: Spearman Dot
          - type: pearson_max
            value: 0.6836009506667413
            name: Pearson Max
          - type: spearman_max
            value: 0.6795423695973911
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 64
          type: sts-test-64
        metrics:
          - type: pearson_cosine
            value: 0.6715253560367674
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.669070001537953
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.6571390159051358
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.6456119247619697
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.6598587843081631
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.6472279949159918
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.36757468941627225
            name: Pearson Dot
          - type: spearman_dot
            value: 0.3678274698380672
            name: Spearman Dot
          - type: pearson_max
            value: 0.6715253560367674
            name: Pearson Max
          - type: spearman_max
            value: 0.669070001537953
            name: Spearman Max
---

SentenceTransformer based on Geotrend/bert-base-sw-cased

This is a sentence-transformers model finetuned from Geotrend/bert-base-sw-cased on the Mollel/swahili-n_li-triplet-swh-eng dataset. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

Model Details

Model Description

  • Model Type: Sentence Transformer
  • Base model: Geotrend/bert-base-sw-cased
  • Maximum Sequence Length: 512 tokens
  • Output Dimensionality: 768 dimensions
  • Similarity Function: Cosine Similarity
  • Training Dataset:
    • Mollel/swahili-n_li-triplet-swh-eng

Model Sources

  • Documentation: Sentence Transformers Documentation (https://sbert.net)
  • Repository: Sentence Transformers on GitHub (https://github.com/UKPLab/sentence-transformers)
  • Hugging Face: Sentence Transformers on Hugging Face (https://huggingface.co/models?library=sentence-transformers)

Full Model Architecture

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
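
The Pooling block performs mean pooling: BertModel token embeddings are averaged over the attention mask. Below is a minimal sketch of that computation using plain transformers, assuming the checkpoint loads as a standard BertModel; for normal use, prefer the sentence-transformers path shown under Usage.

import torch
from transformers import AutoTokenizer, AutoModel

# Load the checkpoint as a plain BertModel plus its tokenizer.
tokenizer = AutoTokenizer.from_pretrained("Mollel/MultiLinguSwahili-bert-base-sw-cased-nli-matryoshka")
model = AutoModel.from_pretrained("Mollel/MultiLinguSwahili-bert-base-sw-cased-nli-matryoshka")

sentences = ["Ndege anayeruka.", "tai huruka"]
batch = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")

with torch.no_grad():
    token_embeddings = model(**batch).last_hidden_state  # (batch, seq_len, 768)

# Mean pooling: average the token embeddings, masking out padding positions.
mask = batch["attention_mask"].unsqueeze(-1).float()
embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
print(embeddings.shape)  # torch.Size([2, 768])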

Usage

Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

pip install -U sentence-transformers

Then you can load this model and run inference.

from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("Mollel/MultiLinguSwahili-bert-base-sw-cased-nli-matryoshka")
# Run inference
sentences = [
    'Mwanamume na mwanamke wachanga waliovaa mikoba wanaweka au kuondoa kitu kutoka kwenye mti mweupe wa zamani, huku watu wengine wamesimama au wameketi nyuma.',
    'mwanamume na mwanamke wenye mikoba',
    'tai huruka',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 768]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
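
Because the model was trained with MatryoshkaLoss over the dimensions [768, 512, 256, 128, 64], embeddings can be truncated to a smaller prefix with only a modest quality drop (see Evaluation below). A sketch using the truncate_dim argument available in recent Sentence Transformers releases; 256 is an illustrative choice:

from sentence_transformers import SentenceTransformer

# truncate_dim keeps only the first N dimensions of every embedding.
model = SentenceTransformer(
    "Mollel/MultiLinguSwahili-bert-base-sw-cased-nli-matryoshka",
    truncate_dim=256,
)
embeddings = model.encode(["Ndege anayeruka.", "tai huruka"])
print(embeddings.shape)
# (2, 256)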

Evaluation

Metrics

Semantic Similarity (sts-test-768)

Metric Value
pearson_cosine 0.6937
spearman_cosine 0.6873
pearson_manhattan 0.6672
spearman_manhattan 0.6578
pearson_euclidean 0.6672
spearman_euclidean 0.6578
pearson_dot 0.5235
spearman_dot 0.5126
pearson_max 0.6937
spearman_max 0.6873

Semantic Similarity (sts-test-512)

Metric Value
pearson_cosine 0.6899
spearman_cosine 0.6847
pearson_manhattan 0.6678
spearman_manhattan 0.658
pearson_euclidean 0.6673
spearman_euclidean 0.6573
pearson_dot 0.4953
spearman_dot 0.4872
pearson_max 0.6899
spearman_max 0.6847

Semantic Similarity (sts-test-256)

Metric Value
pearson_cosine 0.6873
spearman_cosine 0.6817
pearson_manhattan 0.6674
spearman_manhattan 0.6558
pearson_euclidean 0.6675
spearman_euclidean 0.656
pearson_dot 0.4566
spearman_dot 0.4533
pearson_max 0.6873
spearman_max 0.6817

Semantic Similarity (sts-test-128)

Metric Value
pearson_cosine 0.6836
spearman_cosine 0.6795
pearson_manhattan 0.6664
spearman_manhattan 0.6535
pearson_euclidean 0.6664
spearman_euclidean 0.6537
pearson_dot 0.431
spearman_dot 0.4315
pearson_max 0.6836
spearman_max 0.6795

Semantic Similarity (sts-test-64)

Metric Value
pearson_cosine 0.6715
spearman_cosine 0.6691
pearson_manhattan 0.6571
spearman_manhattan 0.6456
pearson_euclidean 0.6599
spearman_euclidean 0.6472
pearson_dot 0.3676
spearman_dot 0.3678
pearson_max 0.6715
spearman_max 0.6691
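
Each table above corresponds to one sts-test evaluator at a given Matryoshka dimension. A hedged sketch of how such figures are produced with EmbeddingSimilarityEvaluator; the sentence pairs and gold scores below are illustrative placeholders, not the actual STS test split:

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction

model = SentenceTransformer("Mollel/MultiLinguSwahili-bert-base-sw-cased-nli-matryoshka")

# Placeholder STS-style data: paired sentences with gold similarity scores in [0, 1].
sentences1 = ["Ndege anayeruka.", "Mwanamke huyo anakimbia."]
sentences2 = ["tai huruka", "Watu wanasafiri kwa baiskeli."]
gold_scores = [0.75, 0.10]

evaluator = EmbeddingSimilarityEvaluator(
    sentences1,
    sentences2,
    gold_scores,
    main_similarity=SimilarityFunction.COSINE,
    name="sts-test-768",
)
results = evaluator(model)  # dict of Pearson/Spearman correlations per similarity function
print(results)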

Training Details

Training Dataset

Mollel/swahili-n_li-triplet-swh-eng

  • Dataset: Mollel/swahili-n_li-triplet-swh-eng
  • Size: 1,115,700 training samples
  • Columns: anchor, positive, and negative
  • Approximate statistics based on the first 1000 samples:
             anchor          positive        negative
    type     string          string          string
    min      9 tokens        6 tokens        6 tokens
    mean     16.73 tokens    19.74 tokens    19.0 tokens
    max      71 tokens       45 tokens       49 tokens
  • Samples:
    anchor: A person on a horse jumps over a broken down airplane.
    positive: A person is outdoors, on a horse.
    negative: A person is at a diner, ordering an omelette.

    anchor: Mtu aliyepanda farasi anaruka juu ya ndege iliyovunjika.
    positive: Mtu yuko nje, juu ya farasi.
    negative: Mtu yuko kwenye mkahawa, akiagiza omelette.

    anchor: Children smiling and waving at camera
    positive: There are children present
    negative: The kids are frowning
  • Loss: MatryoshkaLoss with these parameters:
    {
        "loss": "MultipleNegativesRankingLoss",
        "matryoshka_dims": [
            768,
            512,
            256,
            128,
            64
        ],
        "matryoshka_weights": [
            1,
            1,
            1,
            1,
            1
        ],
        "n_dims_per_step": -1
    }
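
    These parameters map directly onto the loss constructors in sentence_transformers.losses; a minimal construction sketch under that assumption:

    from sentence_transformers import SentenceTransformer
    from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

    model = SentenceTransformer("Geotrend/bert-base-sw-cased")

    # Inner loss: in-batch negatives ranking over (anchor, positive, negative) triplets.
    inner_loss = MultipleNegativesRankingLoss(model)

    # Wrap it so the same objective is applied at each truncated embedding size.
    loss = MatryoshkaLoss(
        model,
        inner_loss,
        matryoshka_dims=[768, 512, 256, 128, 64],
        matryoshka_weights=[1, 1, 1, 1, 1],
        n_dims_per_step=-1,  # -1 trains on all dims at every step
    )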
    

Evaluation Dataset

Mollel/swahili-n_li-triplet-swh-eng

  • Dataset: Mollel/swahili-n_li-triplet-swh-eng
  • Size: 13,168 evaluation samples
  • Columns: anchor, positive, and negative
  • Approximate statistics based on the first 1000 samples:
             anchor          positive        negative
    type     string          string          string
    min      7 tokens        5 tokens        5 tokens
    mean     28.25 tokens    14.16 tokens    15.55 tokens
    max      82 tokens       55 tokens       46 tokens
  • Samples:
    anchor: Two women are embracing while holding to go packages.
    positive: Two woman are holding packages.
    negative: The men are fighting outside a deli.

    anchor: Wanawake wawili wanakumbatiana huku wakishikilia vifurushi vya kwenda.
    positive: Wanawake wawili wanashikilia vifurushi.
    negative: Wanaume hao wanapigana nje ya duka la vyakula vitamu.

    anchor: Two young children in blue jerseys, one with the number 9 and one with the number 2 are standing on wooden steps in a bathroom and washing their hands in a sink.
    positive: Two kids in numbered jerseys wash their hands.
    negative: Two kids in jackets walk to school.
  • Loss: MatryoshkaLoss with these parameters:
    {
        "loss": "MultipleNegativesRankingLoss",
        "matryoshka_dims": [
            768,
            512,
            256,
            128,
            64
        ],
        "matryoshka_weights": [
            1,
            1,
            1,
            1,
            1
        ],
        "n_dims_per_step": -1
    }
    

Training Hyperparameters

Non-Default Hyperparameters

  • per_device_train_batch_size: 32
  • per_device_eval_batch_size: 32
  • learning_rate: 2e-05
  • num_train_epochs: 1
  • warmup_ratio: 0.1
  • bf16: True
  • batch_sampler: no_duplicates
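
In the Sentence Transformers v3 training API, these settings correspond roughly to the sketch below; output_dir and the dataset split name are placeholders rather than values from this card, and the loss is the MatryoshkaLoss described under Training Dataset.

from datasets import load_dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers, SentenceTransformerTrainingArguments

model = SentenceTransformer("Geotrend/bert-base-sw-cased")
train_dataset = load_dataset("Mollel/swahili-n_li-triplet-swh-eng", split="train")  # split name assumed

loss = MatryoshkaLoss(model, MultipleNegativesRankingLoss(model),
                      matryoshka_dims=[768, 512, 256, 128, 64])

args = SentenceTransformerTrainingArguments(
    output_dir="output",  # placeholder
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    num_train_epochs=1,
    warmup_ratio=0.1,
    bf16=True,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)

trainer = SentenceTransformerTrainer(model=model, args=args, train_dataset=train_dataset, loss=loss)
trainer.train()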

All Hyperparameters

  • overwrite_output_dir: False
  • do_predict: False
  • prediction_loss_only: True
  • per_device_train_batch_size: 32
  • per_device_eval_batch_size: 32
  • per_gpu_train_batch_size: None
  • per_gpu_eval_batch_size: None
  • gradient_accumulation_steps: 1
  • eval_accumulation_steps: None
  • learning_rate: 2e-05
  • weight_decay: 0.0
  • adam_beta1: 0.9
  • adam_beta2: 0.999
  • adam_epsilon: 1e-08
  • max_grad_norm: 1.0
  • num_train_epochs: 1
  • max_steps: -1
  • lr_scheduler_type: linear
  • lr_scheduler_kwargs: {}
  • warmup_ratio: 0.1
  • warmup_steps: 0
  • log_level: passive
  • log_level_replica: warning
  • log_on_each_node: True
  • logging_nan_inf_filter: True
  • save_safetensors: True
  • save_on_each_node: False
  • save_only_model: False
  • no_cuda: False
  • use_cpu: False
  • use_mps_device: False
  • seed: 42
  • data_seed: None
  • jit_mode_eval: False
  • use_ipex: False
  • bf16: True
  • fp16: False
  • fp16_opt_level: O1
  • half_precision_backend: auto
  • bf16_full_eval: False
  • fp16_full_eval: False
  • tf32: None
  • local_rank: 0
  • ddp_backend: None
  • tpu_num_cores: None
  • tpu_metrics_debug: False
  • debug: []
  • dataloader_drop_last: False
  • dataloader_num_workers: 0
  • dataloader_prefetch_factor: None
  • past_index: -1
  • disable_tqdm: False
  • remove_unused_columns: True
  • label_names: None
  • load_best_model_at_end: False
  • ignore_data_skip: False
  • fsdp: []
  • fsdp_min_num_params: 0
  • fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
  • fsdp_transformer_layer_cls_to_wrap: None
  • accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}
  • deepspeed: None
  • label_smoothing_factor: 0.0
  • optim: adamw_torch
  • optim_args: None
  • adafactor: False
  • group_by_length: False
  • length_column_name: length
  • ddp_find_unused_parameters: None
  • ddp_bucket_cap_mb: None
  • ddp_broadcast_buffers: False
  • dataloader_pin_memory: True
  • dataloader_persistent_workers: False
  • skip_memory_metrics: True
  • use_legacy_prediction_loop: False
  • push_to_hub: False
  • resume_from_checkpoint: None
  • hub_model_id: None
  • hub_strategy: every_save
  • hub_private_repo: False
  • hub_always_push: False
  • gradient_checkpointing: False
  • gradient_checkpointing_kwargs: None
  • include_inputs_for_metrics: False
  • eval_do_concat_batches: True
  • fp16_backend: auto
  • push_to_hub_model_id: None
  • push_to_hub_organization: None
  • mp_parameters:
  • auto_find_batch_size: False
  • full_determinism: False
  • torchdynamo: None
  • ray_scope: last
  • ddp_timeout: 1800
  • torch_compile: False
  • torch_compile_backend: None
  • torch_compile_mode: None
  • dispatch_batches: None
  • split_batches: None
  • include_tokens_per_second: False
  • include_num_input_tokens_seen: False
  • neftune_noise_alpha: None
  • optim_target_modules: None
  • batch_sampler: no_duplicates
  • multi_dataset_batch_sampler: proportional

Training Logs

Epoch Step Training Loss sts-test-128_spearman_cosine sts-test-256_spearman_cosine sts-test-512_spearman_cosine sts-test-64_spearman_cosine sts-test-768_spearman_cosine
0.0057 100 19.9104 - - - - -
0.0115 200 15.4038 - - - - -
0.0172 300 12.4565 - - - - -
0.0229 400 11.8633 - - - - -
0.0287 500 11.0601 - - - - -
0.0344 600 9.7725 - - - - -
0.0402 700 8.8549 - - - - -
0.0459 800 8.0831 - - - - -
0.0516 900 7.9941 - - - - -
0.0574 1000 7.6537 - - - - -
0.0631 1100 7.9303 - - - - -
0.0688 1200 7.5246 - - - - -
0.0746 1300 7.7754 - - - - -
0.0803 1400 7.668 - - - - -
0.0860 1500 6.7171 - - - - -
0.0918 1600 6.347 - - - - -
0.0975 1700 6.0 - - - - -
0.1033 1800 6.4314 - - - - -
0.1090 1900 6.7947 - - - - -
0.1147 2000 6.9316 - - - - -
0.1205 2100 6.6304 - - - - -
0.1262 2200 6.132 - - - - -
0.1319 2300 5.8953 - - - - -
0.1377 2400 5.6954 - - - - -
0.1434 2500 5.6832 - - - - -
0.1491 2600 5.2266 - - - - -
0.1549 2700 5.0678 - - - - -
0.1606 2800 5.4733 - - - - -
0.1664 2900 6.0899 - - - - -
0.1721 3000 6.332 - - - - -
0.1778 3100 6.4937 - - - - -
0.1836 3200 6.2242 - - - - -
0.1893 3300 5.8023 - - - - -
0.1950 3400 5.0745 - - - - -
0.2008 3500 5.5806 - - - - -
0.2065 3600 5.5191 - - - - -
0.2122 3700 5.3849 - - - - -
0.2180 3800 5.4828 - - - - -
0.2237 3900 5.9982 - - - - -
0.2294 4000 5.6842 - - - - -
0.2352 4100 5.1627 - - - - -
0.2409 4200 5.154 - - - - -
0.2467 4300 5.7932 - - - - -
0.2524 4400 5.5758 - - - - -
0.2581 4500 5.5212 - - - - -
0.2639 4600 5.5692 - - - - -
0.2696 4700 5.2699 - - - - -
0.2753 4800 5.4919 - - - - -
0.2811 4900 5.0754 - - - - -
0.2868 5000 5.1514 - - - - -
0.2925 5100 5.0241 - - - - -
0.2983 5200 5.2679 - - - - -
0.3040 5300 5.3576 - - - - -
0.3098 5400 5.3454 - - - - -
0.3155 5500 5.2142 - - - - -
0.3212 5600 4.8418 - - - - -
0.3270 5700 4.9597 - - - - -
0.3327 5800 5.1989 - - - - -
0.3384 5900 5.2624 - - - - -
0.3442 6000 5.0705 - - - - -
0.3499 6100 5.232 - - - - -
0.3556 6200 5.2428 - - - - -
0.3614 6300 4.755 - - - - -
0.3671 6400 4.7266 - - - - -
0.3729 6500 4.6452 - - - - -
0.3786 6600 5.1431 - - - - -
0.3843 6700 4.5343 - - - - -
0.3901 6800 4.698 - - - - -
0.3958 6900 4.6944 - - - - -
0.4015 7000 4.6255 - - - - -
0.4073 7100 5.0211 - - - - -
0.4130 7200 4.6974 - - - - -
0.4187 7300 4.9182 - - - - -
0.4245 7400 4.652 - - - - -
0.4302 7500 5.1015 - - - - -
0.4360 7600 4.5249 - - - - -
0.4417 7700 4.455 - - - - -
0.4474 7800 4.8153 - - - - -
0.4532 7900 4.7665 - - - - -
0.4589 8000 4.3413 - - - - -
0.4646 8100 4.4697 - - - - -
0.4704 8200 4.6776 - - - - -
0.4761 8300 4.2868 - - - - -
0.4818 8400 4.7052 - - - - -
0.4876 8500 4.4721 - - - - -
0.4933 8600 4.6926 - - - - -
0.4991 8700 4.9891 - - - - -
0.5048 8800 4.4837 - - - - -
0.5105 8900 4.8127 - - - - -
0.5163 9000 4.3438 - - - - -
0.5220 9100 4.4743 - - - - -
0.5277 9200 4.6879 - - - - -
0.5335 9300 4.3593 - - - - -
0.5392 9400 4.3023 - - - - -
0.5449 9500 4.8188 - - - - -
0.5507 9600 4.6142 - - - - -
0.5564 9700 4.7679 - - - - -
0.5622 9800 4.6224 - - - - -
0.5679 9900 4.9154 - - - - -
0.5736 10000 4.7557 - - - - -
0.5794 10100 4.6395 - - - - -
0.5851 10200 4.7977 - - - - -
0.5908 10300 4.915 - - - - -
0.5966 10400 4.4854 - - - - -
0.6023 10500 4.3973 - - - - -
0.6080 10600 4.6964 - - - - -
0.6138 10700 4.8853 - - - - -
0.6195 10800 4.786 - - - - -
0.6253 10900 4.5482 - - - - -
0.6310 11000 4.4857 - - - - -
0.6367 11100 4.7415 - - - - -
0.6425 11200 4.2596 - - - - -
0.6482 11300 4.8578 - - - - -
0.6539 11400 4.5471 - - - - -
0.6597 11500 4.8337 - - - - -
0.6654 11600 4.2244 - - - - -
0.6711 11700 4.9619 - - - - -
0.6769 11800 4.9369 - - - - -
0.6826 11900 4.2697 - - - - -
0.6883 12000 4.2711 - - - - -
0.6941 12100 4.6396 - - - - -
0.6998 12200 4.5626 - - - - -
0.7056 12300 4.5767 - - - - -
0.7113 12400 4.6449 - - - - -
0.7170 12500 4.4217 - - - - -
0.7228 12600 4.0203 - - - - -
0.7285 12700 4.5381 - - - - -
0.7342 12800 4.5865 - - - - -
0.7400 12900 4.4203 - - - - -
0.7457 13000 4.3761 - - - - -
0.7514 13100 4.093 - - - - -
0.7572 13200 5.9235 - - - - -
0.7629 13300 5.4098 - - - - -
0.7687 13400 5.3079 - - - - -
0.7744 13500 5.0946 - - - - -
0.7801 13600 4.7098 - - - - -
0.7859 13700 4.9471 - - - - -
0.7916 13800 4.5742 - - - - -
0.7973 13900 4.6178 - - - - -
0.8031 14000 4.4516 - - - - -
0.8088 14100 4.429 - - - - -
0.8145 14200 4.3812 - - - - -
0.8203 14300 4.3739 - - - - -
0.8260 14400 4.3821 - - - - -
0.8318 14500 4.4396 - - - - -
0.8375 14600 4.2667 - - - - -
0.8432 14700 4.1963 - - - - -
0.8490 14800 4.1298 - - - - -
0.8547 14900 4.1843 - - - - -
0.8604 15000 4.0735 - - - - -
0.8662 15100 3.9319 - - - - -
0.8719 15200 4.1544 - - - - -
0.8776 15300 4.105 - - - - -
0.8834 15400 4.014 - - - - -
0.8891 15500 4.0345 - - - - -
0.8949 15600 3.9127 - - - - -
0.9006 15700 4.1002 - - - - -
0.9063 15800 3.8564 - - - - -
0.9121 15900 3.9297 - - - - -
0.9178 16000 3.8487 - - - - -
0.9235 16100 3.7099 - - - - -
0.9293 16200 3.8545 - - - - -
0.9350 16300 3.8122 - - - - -
0.9407 16400 3.8951 - - - - -
0.9465 16500 3.6996 - - - - -
0.9522 16600 3.9081 - - - - -
0.9580 16700 3.8603 - - - - -
0.9637 16800 3.8534 - - - - -
0.9694 16900 3.8145 - - - - -
0.9752 17000 3.9858 - - - - -
0.9809 17100 3.8224 - - - - -
0.9866 17200 3.7469 - - - - -
0.9924 17300 3.9066 - - - - -
0.9981 17400 3.6754 - - - - -
1.0 17433 - 0.6795 0.6817 0.6847 0.6691 0.6873

Framework Versions

  • Python: 3.11.9
  • Sentence Transformers: 3.0.1
  • Transformers: 4.40.1
  • PyTorch: 2.3.0+cu121
  • Accelerate: 0.29.3
  • Datasets: 2.19.0
  • Tokenizers: 0.19.1

Citation

BibTeX

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

MatryoshkaLoss

@misc{kusupati2024matryoshka,
    title={Matryoshka Representation Learning}, 
    author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
    year={2024},
    eprint={2205.13147},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

MultipleNegativesRankingLoss

@misc{henderson2017efficient,
    title={Efficient Natural Language Response Suggestion for Smart Reply}, 
    author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
    year={2017},
    eprint={1705.00652},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}