Kaguya-19 committed
Commit • 143fca0
1 Parent(s): 09daf17

fit for sentence transformers

Browse files:
- 1_Pooling/config.json +10 -0
- README.md +31 -5
- config.json +1 -1
- config_sentence_transformers.json +9 -0
- configuration.json +1 -0
- modeling_minicpm.py +9 -0
- modules.json +14 -0
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
+{
+    "word_embedding_dimension": 2304,
+    "pooling_mode_cls_token": false,
+    "pooling_mode_mean_tokens": true,
+    "pooling_mode_max_tokens": false,
+    "pooling_mode_mean_sqrt_len_tokens": false,
+    "pooling_mode_weightedmean_tokens": false,
+    "pooling_mode_lasttoken": false,
+    "include_prompt": false
+}
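This config maps one-to-one onto the constructor arguments of the sentence-transformers `Pooling` module. A minimal sketch of the module this file describes, built by hand (when the repo is loaded, it is assembled automatically from modules.json; `include_prompt` assumes sentence-transformers >= 2.4):

```python
# Sketch: the Pooling module that 1_Pooling/config.json describes.
from sentence_transformers.models import Pooling

pooling = Pooling(
    word_embedding_dimension=2304,  # hidden size of the model
    pooling_mode="mean",            # pooling_mode_mean_tokens: true
    include_prompt=False,           # exclude instruction tokens from pooling
)
```

Note that this plain mean is applied to hidden states that `model.forward` has already position-weighted (see the modeling_minicpm.py hunk below), so the overall effect is the weighted mean pooling the README comment describes.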
README.md
CHANGED
@@ -347,6 +347,7 @@ flash-attn>2.3.5
 
 ### 示例脚本 Demo
 
+#### Huggingface Transformers
 ```python
 
 from transformers import AutoModel, AutoTokenizer
@@ -358,10 +359,11 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
 model.eval()
 
-
-
-
-
+# 事实上我们用的是weighted mean pooling,但为了部署方便,我们将一部分pooling步骤集成在model.forward中
+# In fact, we use weighted mean pooling, but for deployment convenience part of the pooling is integrated into model.forward
+def mean_pooling(hidden, attention_mask):
+    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
+    d = attention_mask.sum(dim=1, keepdim=True).float()
     reps = s / d
     return reps
 
@@ -373,7 +375,7 @@ def encode(input_texts):
     attention_mask = batch_dict["attention_mask"]
     hidden = outputs.last_hidden_state
 
-    reps = 
+    reps = mean_pooling(hidden, attention_mask)
     embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
     return embeddings
 
@@ -391,6 +393,30 @@ scores = (embeddings_query @ embeddings_doc.T)
 print(scores.tolist()) # [[0.3535913825035095, 0.18596848845481873]]
 ```
 
+#### Sentence Transformers
+
+```python
+import torch
+from sentence_transformers import SentenceTransformer
+
+model_name = "openbmb/MiniCPM-Embedding"
+model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={"attn_implementation": "flash_attention_2", "torch_dtype": torch.float16})
+model.max_seq_length = 512
+model.tokenizer.padding_side = "right"
+
+queries = ["中国的首都是哪里?"]
+passages = ["beijing", "shanghai"]
+
+INSTRUCTION = "Query: "
+
+embeddings_query = model.encode(queries, prompt=INSTRUCTION, normalize_embeddings=True)
+embeddings_doc = model.encode(passages, normalize_embeddings=True)
+
+scores = (embeddings_query @ embeddings_doc.T)
+print(scores.tolist()) # [[0.3535913825035095, 0.18596848845481873]]
+```
+
 ## 实验结果 Evaluation Results
 
 ### 中文与英文检索结果 CN/EN Retrieval Results
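The hunks above show the Transformers demo only in fragments. A minimal sketch reconstructing the surrounding `encode()` from those fragments, not the README verbatim; the tokenizer arguments are assumptions following the usual pattern:

```python
# Sketch: the full Transformers-side encode flow implied by the diff above.
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True,
                                  attn_implementation="flash_attention_2",
                                  torch_dtype=torch.float16).to("cuda")
model.eval()

def mean_pooling(hidden, attention_mask):
    # plain mean over valid tokens; the position weighting happens in model.forward
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    return s / d

@torch.no_grad()
def encode(input_texts):
    # max_length/padding/truncation are assumed, not shown in the diff
    batch_dict = tokenizer(input_texts, max_length=512, padding=True,
                           truncation=True, return_tensors="pt").to("cuda")
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state
    reps = mean_pooling(hidden, attention_mask)
    return F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
```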
config.json
CHANGED
@@ -1,5 +1,5 @@
 {
-    "_name_or_path": "openbmb/
+    "_name_or_path": "openbmb/MiniCPM-Embedding",
     "architectures": [
         "MiniCPM"
     ],
config_sentence_transformers.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "__version__": {
+        "sentence_transformers": "2.7.0",
+        "transformers": "4.37.2",
+        "pytorch": "2.0.1+cu121"
+    },
+    "prompts": {},
+    "default_prompt_name": null
+}
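`"prompts"` is left empty here, so the README demo passes the instruction explicitly via `prompt=INSTRUCTION`. Had the commit registered a named prompt, callers could select it by name instead; a hypothetical variant (the `"query"` key is an assumption, not part of this commit):

```python
# Hypothetical: if config_sentence_transformers.json carried
#   "prompts": {"query": "Query: "}
# the encode call could reference the registered name instead of a literal string.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("openbmb/MiniCPM-Embedding", trust_remote_code=True,
                            prompts={"query": "Query: "})  # or baked into the JSON
embeddings_query = model.encode(["中国的首都是哪里?"], prompt_name="query",
                                normalize_embeddings=True)
```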
configuration.json
ADDED
@@ -0,0 +1 @@
+{"task":"sentence-embedding"}
modeling_minicpm.py
CHANGED
@@ -1043,6 +1043,8 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids) * self.config.scale_emb
 
+        _attention_mask = attention_mask
+
         if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
@@ -1107,6 +1109,13 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
+        # gen weight before mean pooling
+        attention_mask_ = _attention_mask * _attention_mask.cumsum(dim=1)
+        s = hidden_states * attention_mask_.unsqueeze(-1).float()
+        d = attention_mask_.sum(dim=1, keepdim=True).unsqueeze(1).float() / _attention_mask.sum(dim=1, keepdim=True).unsqueeze(1).float()
+
+        hidden_states = s / d
+
         next_cache = None
         if use_cache:
             next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
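These two hunks split weighted mean pooling across the model and the caller: `forward` scales each token's hidden state by its 1-based position (`attention_mask.cumsum` over valid tokens) and pre-divides by the average weight, so that the README's plain `mean_pooling` then recovers exactly the position-weighted mean. A small sketch checking the algebra on random tensors (shapes and names are illustrative, not from the repo):

```python
# Sketch: in-forward scaling followed by plain mean pooling equals
# position-weighted mean pooling. Assumed shapes: hidden (B, L, H),
# attention_mask (B, L) with 1 for real tokens, 0 for padding.
import torch

def weighted_mean_pooling(hidden, attention_mask):
    # weight token i by its 1-based position, so later tokens count more
    weights = attention_mask * attention_mask.cumsum(dim=1)       # (B, L)
    s = torch.sum(hidden * weights.unsqueeze(-1).float(), dim=1)  # (B, H)
    d = weights.sum(dim=1, keepdim=True).float()                  # (B, 1)
    return s / d

def forward_scaled_then_plain_mean(hidden, attention_mask):
    # stage 1: what the new forward() code does
    weights = attention_mask * attention_mask.cumsum(dim=1)
    s = hidden * weights.unsqueeze(-1).float()
    d = weights.sum(dim=1, keepdim=True).unsqueeze(1).float() \
        / attention_mask.sum(dim=1, keepdim=True).unsqueeze(1).float()
    scaled = s / d
    # stage 2: the README's mean_pooling
    s2 = torch.sum(scaled * attention_mask.unsqueeze(-1).float(), dim=1)
    d2 = attention_mask.sum(dim=1, keepdim=True).float()
    return s2 / d2

hidden = torch.randn(2, 5, 8)
mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])
assert torch.allclose(weighted_mean_pooling(hidden, mask),
                      forward_scaled_then_plain_mean(hidden, mask), atol=1e-5)
```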
modules.json
ADDED
@@ -0,0 +1,14 @@
+[
+    {
+        "idx": 0,
+        "name": "0",
+        "path": "",
+        "type": "sentence_transformers.models.Transformer"
+    },
+    {
+        "idx": 1,
+        "name": "1",
+        "path": "1_Pooling",
+        "type": "sentence_transformers.models.Pooling"
+    }
+]
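modules.json is what lets `SentenceTransformer(model_name)` assemble the pipeline automatically: module 0 is the Transformer wrapper at the repo root (`"path": ""`), module 1 the Pooling module configured in 1_Pooling/config.json. A hand-rolled equivalent, as a sketch rather than the intended loading path:

```python
# Sketch of the two-module pipeline modules.json declares.
# SentenceTransformer("openbmb/MiniCPM-Embedding") builds this automatically.
from sentence_transformers import SentenceTransformer, models

word = models.Transformer(
    "openbmb/MiniCPM-Embedding",              # "path": "" -> repo root
    model_args={"trust_remote_code": True},   # MiniCPM ships custom modeling code
)
pool = models.Pooling(
    word.get_word_embedding_dimension(),      # 2304, matches 1_Pooling/config.json
    pooling_mode="mean",
    include_prompt=False,
)
model = SentenceTransformer(modules=[word, pool])
```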