Kaguya-19 committed
Commit • 143fca0
1 Parent(s): 09daf17

fit for sentence transformers

Browse files:
- 1_Pooling/config.json +10 -0
- README.md +31 -5
- config.json +1 -1
- config_sentence_transformers.json +9 -0
- configuration.json +1 -0
- modeling_minicpm.py +9 -0
- modules.json +14 -0
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
+{
+    "word_embedding_dimension": 2304,
+    "pooling_mode_cls_token": false,
+    "pooling_mode_mean_tokens": true,
+    "pooling_mode_max_tokens": false,
+    "pooling_mode_mean_sqrt_len_tokens": false,
+    "pooling_mode_weightedmean_tokens": false,
+    "pooling_mode_lasttoken": false,
+    "include_prompt": false
+}
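This config maps one-to-one onto the constructor arguments of the sentence-transformers `Pooling` module. A minimal sketch of the module this file describes, built by hand (when the repo is loaded, it is assembled automatically from modules.json; `include_prompt` assumes sentence-transformers >= 2.4):

```python
# Sketch: the Pooling module that 1_Pooling/config.json describes.
from sentence_transformers.models import Pooling

pooling = Pooling(
    word_embedding_dimension=2304,  # hidden size of the model
    pooling_mode="mean",            # pooling_mode_mean_tokens: true
    include_prompt=False,           # exclude instruction tokens from pooling
)
```

Note that this plain mean is applied to hidden states that `model.forward` has already position-weighted (see the modeling_minicpm.py hunk below), so the overall effect is the weighted mean pooling the README comment describes.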
README.md
CHANGED
@@ -347,6 +347,7 @@ flash-attn>2.3.5
 
 ### 示例脚本 Demo
 
+#### Huggingface Transformers
 ```python
 
 from transformers import AutoModel, AutoTokenizer
@@ -358,10 +359,11 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
 model.eval()
 
-
-
-
-
+# 事实上我们用的是weighted mean pooling,但为了部署方便,我们将一部分pooling步骤集成在model.forward中
+# In fact, we use weighted mean pooling, but for deployment convenience part of the pooling is integrated into model.forward
+def mean_pooling(hidden, attention_mask):
+    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
+    d = attention_mask.sum(dim=1, keepdim=True).float()
     reps = s / d
     return reps
 
@@ -373,7 +375,7 @@ def encode(input_texts):
     attention_mask = batch_dict["attention_mask"]
     hidden = outputs.last_hidden_state
 
-    reps = 
+    reps = mean_pooling(hidden, attention_mask)
     embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
     return embeddings
 
@@ -391,6 +393,30 @@ scores = (embeddings_query @ embeddings_doc.T)
 print(scores.tolist()) # [[0.3535913825035095, 0.18596848845481873]]
 ```
 
+#### Sentence Transformers
+
+```python
+import torch
+from sentence_transformers import SentenceTransformer
+
+model_name = "openbmb/MiniCPM-Embedding"
+model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={"attn_implementation": "flash_attention_2", "torch_dtype": torch.float16})
+model.max_seq_length = 512
+model.tokenizer.padding_side = "right"
+
+queries = ["中国的首都是哪里?"]
+passages = ["beijing", "shanghai"]
+
+INSTRUCTION = "Query: "
+
+embeddings_query = model.encode(queries, prompt=INSTRUCTION, normalize_embeddings=True)
+embeddings_doc = model.encode(passages, normalize_embeddings=True)
+
+scores = (embeddings_query @ embeddings_doc.T)
+print(scores.tolist()) # [[0.3535913825035095, 0.18596848845481873]]
+```
+
 ## 实验结果 Evaluation Results
 
 ### 中文与英文检索结果 CN/EN Retrieval Results
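The hunks above show the Transformers demo only in fragments. A minimal sketch reconstructing the surrounding `encode()` from those fragments, not the README verbatim; the tokenizer arguments are assumptions following the usual pattern:

```python
# Sketch: the full Transformers-side encode flow implied by the diff above.
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True,
                                  attn_implementation="flash_attention_2",
                                  torch_dtype=torch.float16).to("cuda")
model.eval()

def mean_pooling(hidden, attention_mask):
    # plain mean over valid tokens; the position weighting happens in model.forward
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    return s / d

@torch.no_grad()
def encode(input_texts):
    # max_length/padding/truncation are assumed, not shown in the diff
    batch_dict = tokenizer(input_texts, max_length=512, padding=True,
                           truncation=True, return_tensors="pt").to("cuda")
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state
    reps = mean_pooling(hidden, attention_mask)
    return F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
```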
config.json
CHANGED
@@ -1,5 +1,5 @@
 {
-    "_name_or_path": "openbmb/
+    "_name_or_path": "openbmb/MiniCPM-Embedding",
     "architectures": [
         "MiniCPM"
     ],
config_sentence_transformers.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "__version__": {
+        "sentence_transformers": "2.7.0",
+        "transformers": "4.37.2",
+        "pytorch": "2.0.1+cu121"
+    },
+    "prompts": {},
+    "default_prompt_name": null
+}
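`"prompts"` is left empty here, so the README demo passes the instruction explicitly via `prompt=INSTRUCTION`. Had the commit registered a named prompt, callers could select it by name instead; a hypothetical variant (the `"query"` key is an assumption, not part of this commit):

```python
# Hypothetical: if config_sentence_transformers.json carried
#   "prompts": {"query": "Query: "}
# the encode call could reference the registered name instead of a literal string.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("openbmb/MiniCPM-Embedding", trust_remote_code=True,
                            prompts={"query": "Query: "})  # or baked into the JSON
embeddings_query = model.encode(["中国的首都是哪里?"], prompt_name="query",
                                normalize_embeddings=True)
```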
configuration.json
ADDED
@@ -0,0 +1 @@
+{"task":"sentence-embedding"}
modeling_minicpm.py
CHANGED
@@ -1043,6 +1043,8 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids) * self.config.scale_emb
 
+        _attention_mask = attention_mask
+
         if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
@@ -1107,6 +1109,13 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
+        # gen weight before mean pooling
+        attention_mask_ = _attention_mask * _attention_mask.cumsum(dim=1)
+        s = hidden_states * attention_mask_.unsqueeze(-1).float()
+        d = attention_mask_.sum(dim=1, keepdim=True).unsqueeze(1).float() / _attention_mask.sum(dim=1, keepdim=True).unsqueeze(1).float()
+
+        hidden_states = s / d
+
         next_cache = None
         if use_cache:
             next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
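These two hunks split weighted mean pooling across the model and the caller: `forward` scales each token's hidden state by its 1-based position (`attention_mask.cumsum` over valid tokens) and pre-divides by the average weight, so that the README's plain `mean_pooling` then recovers exactly the position-weighted mean. A small sketch checking the algebra on random tensors (shapes and names are illustrative, not from the repo):

```python
# Sketch: in-forward scaling followed by plain mean pooling equals
# position-weighted mean pooling. Assumed shapes: hidden (B, L, H),
# attention_mask (B, L) with 1 for real tokens, 0 for padding.
import torch

def weighted_mean_pooling(hidden, attention_mask):
    # weight token i by its 1-based position, so later tokens count more
    weights = attention_mask * attention_mask.cumsum(dim=1)       # (B, L)
    s = torch.sum(hidden * weights.unsqueeze(-1).float(), dim=1)  # (B, H)
    d = weights.sum(dim=1, keepdim=True).float()                  # (B, 1)
    return s / d

def forward_scaled_then_plain_mean(hidden, attention_mask):
    # stage 1: what the new forward() code does
    weights = attention_mask * attention_mask.cumsum(dim=1)
    s = hidden * weights.unsqueeze(-1).float()
    d = weights.sum(dim=1, keepdim=True).unsqueeze(1).float() \
        / attention_mask.sum(dim=1, keepdim=True).unsqueeze(1).float()
    scaled = s / d
    # stage 2: the README's mean_pooling
    s2 = torch.sum(scaled * attention_mask.unsqueeze(-1).float(), dim=1)
    d2 = attention_mask.sum(dim=1, keepdim=True).float()
    return s2 / d2

hidden = torch.randn(2, 5, 8)
mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])
assert torch.allclose(weighted_mean_pooling(hidden, mask),
                      forward_scaled_then_plain_mean(hidden, mask), atol=1e-5)
```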
modules.json
ADDED
@@ -0,0 +1,14 @@
+[
+    {
+        "idx": 0,
+        "name": "0",
+        "path": "",
+        "type": "sentence_transformers.models.Transformer"
+    },
+    {
+        "idx": 1,
+        "name": "1",
+        "path": "1_Pooling",
+        "type": "sentence_transformers.models.Pooling"
+    }
+]
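modules.json is what lets `SentenceTransformer(model_name)` assemble the pipeline automatically: module 0 is the Transformer wrapper at the repo root (`"path": ""`), module 1 the Pooling module configured in 1_Pooling/config.json. A hand-rolled equivalent, as a sketch rather than the intended loading path:

```python
# Sketch of the two-module pipeline modules.json declares.
# SentenceTransformer("openbmb/MiniCPM-Embedding") builds this automatically.
from sentence_transformers import SentenceTransformer, models

word = models.Transformer(
    "openbmb/MiniCPM-Embedding",              # "path": "" -> repo root
    model_args={"trust_remote_code": True},   # MiniCPM ships custom modeling code
)
pool = models.Pooling(
    word.get_word_embedding_dimension(),      # 2304, matches 1_Pooling/config.json
    pooling_mode="mean",
    include_prompt=False,
)
model = SentenceTransformer(modules=[word, pool])
```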