Update README.md
README.md
CHANGED
@@ -40,10 +40,19 @@ from transformers import AutoModel, AutoTokenizer
 model_name="jgrosjean-mathesis/swissbert-for-sentence-embeddings"
 model = AutoModel.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model.set_default_language("de_CH")
-
-def generate_sentence_embedding(sentence, ):
 
+def generate_sentence_embedding(sentence, language):
+
+    # Set adapter to specified language
+    if "de" in language:
+        model.set_default_language("de_CH")
+    if "fr" in language:
+        model.set_default_language("fr_CH")
+    if "it" in language:
+        model.set_default_language("it_CH")
+    if "rm" in language:
+        model.set_default_language("rm_CH")
+
     # Tokenize input sentence
     inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt", max_length=512)
 
@@ -56,7 +65,7 @@ def generate_sentence_embedding(sentence, ):
 
     return embedding
 
-sentence_embedding = generate_sentence_embedding("Wir feiern am 1. August den Schweizer Nationalfeiertag.")
+sentence_embedding = generate_sentence_embedding("Wir feiern am 1. August den Schweizer Nationalfeiertag.", language="de")
 print(sentence_embedding)
 ```
 Output:
@@ -67,6 +76,26 @@ tensor([[ 5.6306e-02, -2.8375e-01, -4.1495e-02, 7.4393e-02, -3.1552e-01,
 ...]])
 ```
 
+### Semantic Textual Similarity
+
+```python
+from sklearn.metrics.pairwise import cosine_similarity
+
+# Define two sentences
+sentence_1 = ["Der Zug kommt um 9 Uhr in Zürich an."]
+sentence_2 = ["Le train arrive à Lausanne à 9h."]
+
+# Compute embeddings for both
+embedding_1 = generate_sentence_embedding(sentence_1, language="de")
+embedding_2 = generate_sentence_embedding(sentence_2, language="fr")
+
+# Compute cosine similarity
+cosine_score = cosine_similarity(embedding_1, embedding_2)
+
+# Output the score
+print("The cosine score for", sentence_1, "and", sentence_2, "is", cosine_score)
+```
+
 ## Bias, Risks, and Limitations
 
 <!-- This section is meant to convey both technical and sociotechnical limitations. -->
@@ -162,18 +191,6 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
 
 [More Information Needed]
 
-### Compute Infrastructure
-
-[More Information Needed]
-
-#### Hardware
-
-[More Information Needed]
-
-#### Software
-
-[More Information Needed]
-
 ## Citation [optional]
 
 <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->