Introduce a custom Sentence Transformer module for smooth multi-modality (#1)
- Introduce custom Sentence Transformer module (9862f98edfbc3c5f1a56b1a00ef87ad1b9af3b76)
- Use self.max_seq_length to inform the maximum tokenize length (c0c6d64415a1e25865af6dbb702ac5ba5a1645e4)
- Merge branch 'main' into pr/1, resolve merge conflict (008f2574a5989c788b0fa395d4342a0e1c40f250)
- README.md +11 -74
- custom_st.py +87 -0
- modules.json +12 -6
- sentence_bert_config.json +4 -1
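
The second commit ties truncation to the module's configured sequence length instead of a hard-coded 1024. As a rough sketch of the effect (assuming `model` has been loaded as in the Usage section below), lowering `model.max_seq_length` now directly bounds the tokenized batch:

```python
# Sketch only: assumes `model` is loaded as in the README Usage section below.
# SentenceTransformer.max_seq_length proxies to the first module, and the custom
# tokenize() uses self.max_seq_length as its truncation limit.
model.max_seq_length = 512
batch = model.tokenize(["a very long document " * 2000])
print(batch["input_ids"].shape[1])  # expected to be <= 512 after truncation
```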
README.md
CHANGED
The `## Usage` example is simplified: the image handling that used to be monkey-patched onto the model now lives in the new `custom_st.py` module, so the README only needs the plain Sentence Transformers API. The old version opened with a long prelude of imports and two stand-alone helpers, `jasper_vl_forward` and `jasper_vl_tokenize`, that were later patched onto the model:

```python
import functools
import PIL
import numpy as np
import torch
from typing import Dict
from io import BytesIO
from transformers import SiglipImageProcessor
from sentence_transformers import SentenceTransformer


def jasper_vl_forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
    trans_features = {"input_ids": features["input_ids"], "attention_mask": features["attention_mask"]}
    if "pixel_values" in features:
        trans_features["pixel_values"] = features["pixel_values"]
    sentence_embedding = self.auto_model(**trans_features, **kwargs)["sentence_embedding"]
    features.update({"sentence_embedding": sentence_embedding})
    return features


def jasper_vl_tokenize(self, texts: list[Dict] | list[str]) -> dict[str, torch.Tensor]:
    img_start_token = "<|jasper_img_start|>"
    img_token = "<|jasper_img_token|>"
    img_end_token = "<|jasper_img_end|>"
    num_img_tokens = 300

    def process_text_item(item):
        if isinstance(item, str):
            return item, []
        text, images = "", []
        for sub_item in item:
            if sub_item["type"] == "text":
                text += sub_item["content"]
            elif sub_item["type"] == "image_bytes":
                text += img_start_token + img_token * num_img_tokens + img_end_token
                images.append(PIL.Image.open(BytesIO(sub_item["content"])).convert("RGB"))
            elif sub_item["type"] == "image_path":
                text += img_start_token + img_token * num_img_tokens + img_end_token
                images.append(PIL.Image.open(sub_item["content"]).convert("RGB"))
            else:
                raise ValueError(f"unknown data type {sub_item['type']}")
        return text, images

    all_texts, all_images = [], []
    for item in texts:
        text, images = process_text_item(item)
        all_texts.append(text)
        all_images.extend(images)
    ipt = self.tokenizer(all_texts, padding="longest", truncation=True, max_length=1024, return_tensors="pt")
    if all_images:
        ipt["pixel_values"] = self.processor(images=all_images, return_tensors="pt")["pixel_values"]
        # For demonstration, the external variable `use_gpu` is referenced here; adapt this to your environment.
        if use_gpu:
            ipt["pixel_values"] = ipt["pixel_values"].bfloat16()
    return ipt
```
It also defined its own `prompt_dict`; the new example instead selects the query prompt by name via `prompt_name="s2p_query"`, so the dictionary is removed:

```python
prompt_dict = {
    "s2p_query": "Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: ",
    "s2s_query": "Instruct: Retrieve semantically similar text.\nQuery: "
}
```
Inside the `__main__` block, the hard-coded `device="cpu"` argument (now `device="cpu" if not use_gpu else "cuda"`), the `tokenizer_kwargs={"padding_side": "right"}` argument, and the manual wiring of the image processor, tokenizer, and forward pass are all dropped:

```python
model.processor = SiglipImageProcessor.from_pretrained(model_name)
model.tokenize = functools.partial(jasper_vl_tokenize, model)
model._first_module().forward = functools.partial(jasper_vl_forward, model._first_module())
```
Finally, the `q_vecs` / `doc_vecs` encoding calls are updated and the NumPy-based similarity computation is replaced by the built-in `model.similarity` API:

```python
print(np.matmul(q_vecs, doc_vecs.T))
# the output is:
# [[0.777521 0.75944513 0.24291277 0.2187205]
#  [0.32261407 0.30536035 0.74208796 0.5484469]]
```
After the change, the `## Usage` section reduces to the standard Sentence Transformers workflow (unchanged context between the edited hunks is elided below):

```python
import torch
from sentence_transformers import SentenceTransformer

DOC1 = """
Blue light is scattered in all directions by the tiny molecules of air in Earth's atmosphere.
Blue is scattered more than other colors because it travels as shorter, smaller waves. This is why we see a blue sky most of the time.
...
"""
DOC2 = """
...
Color palette: Limit your color palette to a main color and one or two additional colors.
60-30-10 rule: Use a primary color 60% of the time, a secondary color 30% of the time, and an accent color 10% of the time
"""

if __name__ == "__main__":
    # load model
    use_gpu = False
    # ...
    model = SentenceTransformer(
        model_name,
        trust_remote_code=True,
        device="cpu" if not use_gpu else "cuda",
        model_kwargs={
            "torch_dtype": torch.bfloat16 if use_gpu else torch.float32,
            "attn_implementation": "sdpa"
        },
        ## 1024 is recommended
        # set is_text_encoder to 'True' if you do not encode images
        config_kwargs={"is_text_encoder": False, "vector_dim": 1024},
    )
    # We can reduce the max_seq_length from the default of 2048 for faster encoding
    model.max_seq_length = 1024

    # data
    q_list = [
        "Why the sky is blue?",
        # ...
    ]
    doc_list = [
        DOC1,
        [{"type": "image_path", "content": "./assets/img1.png"}, {"type": "text", "content": "Hope this image helps!"}],
        DOC2,
        [{"type": "image_path", "content": "./assets/img2.png"}],
    ]
    q_vecs = model.encode(q_list, prompt_name="s2p_query")
    doc_vecs = model.encode(doc_list)

    # calculate similarity
    similarities = model.similarity(q_vecs, doc_vecs)
    print(similarities)
    # the output is:
    # tensor([[0.7775, 0.7594, 0.2429, 0.2187],
    #         [0.3226, 0.3054, 0.7421, 0.5484]])
```
|
custom_st.py
ADDED
The new module subclasses `sentence_transformers.models.Transformer` and moves the image handling from the README into the model pipeline:

```python
from io import BytesIO
from typing import Any, Dict, Optional

import PIL
import torch
from transformers import SiglipImageProcessor
from sentence_transformers.models import Transformer as BaseTransformer


class MultiModalTransformer(BaseTransformer):

    def __init__(
        self,
        model_name_or_path: str,
        cache_dir: Optional[str] = None,
        tokenizer_args: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        super().__init__(model_name_or_path, **kwargs)
        if tokenizer_args is None:
            tokenizer_args = {}
        self.processor = SiglipImageProcessor.from_pretrained(
            model_name_or_path, cache_dir=cache_dir, **tokenizer_args
        )

    def forward(
        self, features: dict[str, torch.Tensor], **kwargs
    ) -> dict[str, torch.Tensor]:
        trans_features = {
            "input_ids": features["input_ids"],
            "attention_mask": features["attention_mask"],
        }
        if "pixel_values" in features:
            trans_features["pixel_values"] = features["pixel_values"].to(
                self.auto_model.dtype
            )

        sentence_embedding = self.auto_model(**trans_features, **kwargs)[
            "sentence_embedding"
        ]
        features.update({"sentence_embedding": sentence_embedding})
        return features

    def tokenize(self, texts: list[Dict] | list[str]) -> dict[str, torch.Tensor]:
        img_start_token = "<|jasper_img_start|>"
        img_token = "<|jasper_img_token|>"
        img_end_token = "<|jasper_img_end|>"
        num_img_tokens = 300

        def process_text_item(item):
            if isinstance(item, str):
                return item, []
            text, images = "", []
            for sub_item in item:
                if sub_item["type"] == "text":
                    text += sub_item["content"]
                elif sub_item["type"] == "image_bytes":
                    text += img_start_token + img_token * num_img_tokens + img_end_token
                    images.append(
                        PIL.Image.open(BytesIO(sub_item["content"])).convert("RGB")
                    )
                elif sub_item["type"] == "image_path":
                    text += img_start_token + img_token * num_img_tokens + img_end_token
                    images.append(PIL.Image.open(sub_item["content"]).convert("RGB"))
                else:
                    raise ValueError(f"unknown data type {sub_item['type']}")
            return text, images

        all_texts, all_images = [], []
        for item in texts:
            text, images = process_text_item(item)
            all_texts.append(text)
            all_images.extend(images)
        ipt = self.tokenizer(
            all_texts,
            padding="longest",
            truncation=True,
            max_length=self.max_seq_length,
            return_tensors="pt",
        )
        if all_images:
            ipt["pixel_values"] = self.processor(
                images=all_images, return_tensors="pt"
            )["pixel_values"]
        return ipt
```
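
Compared with the snippet that used to live in the README, `tokenize` now truncates to `self.max_seq_length` instead of a hard-coded 1024, and `forward` casts `pixel_values` to the backbone's dtype, so callers no longer need the manual `.bfloat16()` step. The way each image is expanded into placeholder tokens is unchanged; as a plain-string illustration (no model required, the token strings are taken from the module above):

```python
# Illustration of the placeholder expansion performed by tokenize() above:
# each image contributes an img-start token, 300 img tokens, and an img-end token
# to the text stream, while the PIL image itself goes to SiglipImageProcessor.
img_start = "<|jasper_img_start|>"
img = "<|jasper_img_token|>"
img_end = "<|jasper_img_end|>"

doc = [
    {"type": "text", "content": "Hope this image helps!"},
    {"type": "image_path", "content": "./assets/img1.png"},
]
flattened = ""
for part in doc:
    if part["type"] == "text":
        flattened += part["content"]
    else:  # image_path / image_bytes
        flattened += img_start + img * 300 + img_end

assert flattened.count(img) == 300
print(flattened[:80])
```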
modules.json
CHANGED
The module list now names two modules: the custom transformer, loaded from `custom_st.py` at the repository root, followed by an L2 normalization step:

```json
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "custom_st.MultiModalTransformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Normalize",
    "type": "sentence_transformers.models.Normalize"
  }
]
```
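
Because module 0 has an empty `path` and a `custom_st.` type prefix, Sentence Transformers imports the class from `custom_st.py` in the repository, which is why the README passes `trust_remote_code=True`. A rough sketch of what the resulting pipeline looks like at runtime (assuming `model` is loaded as in the README Usage section):

```python
# Sketch: the module order defined by modules.json, assuming `model` is loaded
# with trust_remote_code=True as in the README Usage section.
first, last = model._first_module(), model._last_module()
print(type(first).__name__)  # MultiModalTransformer (from custom_st.py)
print(type(last).__name__)   # Normalize

emb = model.encode(["Why the sky is blue?"])
print(float((emb ** 2).sum() ** 0.5))  # ~1.0, since the Normalize module L2-normalizes
```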
sentence_bert_config.json
CHANGED
The tokenizer's right-side padding, previously supplied at load time via `tokenizer_kwargs` in the README, is now part of the saved module configuration:

```json
{
  "max_seq_length": 2048,
  "do_lower_case": false,
  "tokenizer_args": {
    "padding_side": "right"
  }
}
```
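
These keys are read when the module is loaded and forwarded to the `MultiModalTransformer` constructor above, which is why the README no longer sets `tokenizer_kwargs` itself. A quick sanity check of the resulting default (assuming `model` is loaded as in the README Usage section):

```python
# Sketch: max_seq_length defaults to 2048 from sentence_bert_config.json;
# the README then lowers it to 1024 for faster encoding.
print(model.max_seq_length)  # 2048 right after loading, before any override
```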