Update README.md
README.md
---
language:
- zh
license: mit
---

# SimCSE(sup)

## Uses

You can use our model to encode sentences into embeddings:
```python
import torch
from transformers import BertTokenizer
from transformers import BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Load the tokenizer and model.
simcse_sup_path = "hellonlp/simcse-roberta-large-zh"
tokenizer = BertTokenizer.from_pretrained(simcse_sup_path)
MODEL = BertModel.from_pretrained(simcse_sup_path)

def get_vector_simcse(sentence):
    """
    Compute the SimCSE semantic vector of a sentence.
    """
    input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
    with torch.no_grad():
        output = MODEL(input_ids)
    # Use the [CLS] token embedding as the sentence representation.
    return output.last_hidden_state[:, 0].squeeze(0)

embeddings = get_vector_simcse("武汉是一个美丽的城市。")
print(embeddings.shape)
# torch.Size([768])
```
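If you need embeddings for many sentences, a batched variant avoids one forward pass per sentence. The helper below is a minimal sketch, not part of the original example: `get_vectors_simcse` is a hypothetical name, and it simply reuses the `tokenizer` and `MODEL` defined above with padding and truncation.

```python
def get_vectors_simcse(sentences):
    """
    Sketch: encode a list of sentences in one batch (assumes tokenizer/MODEL above).
    """
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = MODEL(**inputs)
    # One [CLS] embedding per sentence: shape (batch_size, hidden_size).
    return output.last_hidden_state[:, 0]

vectors = get_vectors_simcse(["武汉是一个美丽的城市。", "你好吗"])
print(vectors.shape)
```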
You can also compute the cosine similarity between two sentences:
```python
def get_similarity_two(sentence1, sentence2):
    # Encode both sentences and compute their cosine similarity.
    vec1 = get_vector_simcse(sentence1).tolist()
    vec2 = get_vector_simcse(sentence2).tolist()
    similarity = cosine_similarity([vec1], [vec2]).tolist()[0][0]
    return similarity

sentence1 = '你好吗'
sentence2 = '你还好吗'
result = get_similarity_two(sentence1, sentence2)
print(result)
# 0.848331
```
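The pairwise helper also extends to ranking. The snippet below is a small illustrative sketch, not from the original card: `rank_by_similarity` is a hypothetical helper that scores each candidate sentence against a query with `get_similarity_two` and sorts the results, most similar first.

```python
def rank_by_similarity(query, candidates):
    # Score each candidate against the query, then sort by similarity (descending).
    scored = [(sentence, get_similarity_two(query, sentence)) for sentence in candidates]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)

for sentence, score in rank_by_similarity('你好吗', ['你还好吗', '武汉是一个美丽的城市。']):
    print(f'{score:.4f}  {sentence}')
```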