jgrosjean committed on
Commit
5b5aee3
1 Parent(s): aa3db3f

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +10 -8
README.md CHANGED
@@ -33,7 +33,7 @@ import torch
33
  from transformers import AutoModel, AutoTokenizer
34
 
35
  # Load swissBERT for sentence embeddings model
36
- model_name="jgrosjean-mathesis/swissbert-for-sentence-embeddings"
37
  model = AutoModel.from_pretrained(model_name)
38
  tokenizer = AutoTokenizer.from_pretrained(model_name)
39
 
@@ -41,13 +41,13 @@ def generate_sentence_embedding(sentence, language):
41
 
42
  # Set adapter to specified language
43
  if "de" in language:
44
- model.set_default_language("de_CH")
45
  if "fr" in language:
46
- model.set_default_language("fr_CH")
47
  if "it" in language:
48
- model.set_default_language("it_CH")
49
  if "rm" in language:
50
- model.set_default_language("rm_CH")
51
 
52
  # Tokenize input sentence
53
  inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt", max_length=512)
@@ -86,11 +86,15 @@ embedding_1 = generate_sentence_embedding(sentence_1, language="de")
86
  embedding_2 = generate_sentence_embedding(sentence_2, language="fr")
87
 
88
  #Compute cosine-similarity
89
- cosine_score = cosine_similarity((embedding_1, embedding_2)
90
 
91
  #Output the score
92
  print("The cosine score for", sentence_1, "and", sentence_2, "is", cosine_score)
93
  ```
 
 
 
 
94
 
95
  ## Bias, Risks, and Limitations
96
 
@@ -123,8 +127,6 @@ Batch size: 512
123
 
124
  <!-- This section describes the evaluation protocols and provides the results. -->
125
 
126
- ### Testing Data, Factors & Metrics
127
-
128
  #### Baseline
129
 
130
  The first baseline is [distiluse-base-multilingual-cased](https://www.sbert.net/examples/training/multilingual/README.html), a high-performing Sentence Transformer model that is able to process German, French and Italian (and more).
 
33
  from transformers import AutoModel, AutoTokenizer
34
 
35
  # Load swissBERT for sentence embeddings model
36
+ model_name = "jgrosjean-mathesis/swissbert-for-sentence-embeddings"
37
  model = AutoModel.from_pretrained(model_name)
38
  tokenizer = AutoTokenizer.from_pretrained(model_name)
39
 
 
41
 
42
  # Set adapter to specified language
43
  if "de" in language:
44
+ model.set_default_language("de_CH")
45
  if "fr" in language:
46
+ model.set_default_language("fr_CH")
47
  if "it" in language:
48
+ model.set_default_language("it_CH")
49
  if "rm" in language:
50
+ model.set_default_language("rm_CH")
51
 
52
  # Tokenize input sentence
53
  inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt", max_length=512)
 
86
  embedding_2 = generate_sentence_embedding(sentence_2, language="fr")
87
 
88
  #Compute cosine-similarity
89
+ cosine_score = cosine_similarity(embedding_1, embedding_2)
90
 
91
  #Output the score
92
  print("The cosine score for", sentence_1, "and", sentence_2, "is", cosine_score)
93
  ```
94
+ Output:
95
+ ```
96
+ The cosine score for ['Der Zug kommt um 9 Uhr in Zürich an.'] and ['Le train arrive à Lausanne à 9h.'] is [[0.85555995]]
97
+ ```
98
 
99
  ## Bias, Risks, and Limitations
100
 
 
127
 
128
  <!-- This section describes the evaluation protocols and provides the results. -->
129
 
 
 
130
  #### Baseline
131
 
132
  The first baseline is [distiluse-base-multilingual-cased](https://www.sbert.net/examples/training/multilingual/README.html), a high-performing Sentence Transformer model that is able to process German, French and Italian (and more).