yliu279 committed
Commit
2e059fe
1 Parent(s): 4a720d1

Add first-party Sentence Transformers support + README snippet (#1)


- Add first-party Sentence Transformers support + snippet (0737ee5e0072b3e79863d63f5ab2b4bcb573b442)

1_Pooling/config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "word_embedding_dimension": 4096,
+   "pooling_mode_cls_token": false,
+   "pooling_mode_mean_tokens": false,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false,
+   "pooling_mode_weightedmean_tokens": false,
+   "pooling_mode_lasttoken": true
+ }
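For context: this config enables only `pooling_mode_lasttoken`, so the Sentence Transformers `Pooling` module returns the hidden state of each sequence's final non-padding token, the same operation that the README's `last_token_pool` helper (in the diff below) performs by hand; the `word_embedding_dimension` of 4096 matches the Mistral-7B hidden size. A reference sketch of that pooling step:

```python
import torch
from torch import Tensor

def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    # With left padding, the final position holds a real token for every sequence.
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    # With right padding, index each sequence's last non-padding position instead.
    sequence_lengths = attention_mask.sum(dim=1) - 1
    batch_size = last_hidden_states.shape[0]
    return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device),
                              sequence_lengths]
```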
README.md CHANGED
@@ -3287,12 +3287,15 @@ This project is for research purposes only. Third-party datasets may be subject
  More technical details will be updated later.

  ## How to run
+
+ ### Transformers
  The models can be used as follows:
  ```python
  import torch
  import torch.nn.functional as F
  from torch import Tensor
  from transformers import AutoTokenizer, AutoModel
+
  def last_token_pool(last_hidden_states: Tensor,
                      attention_mask: Tensor) -> Tensor:
      left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
@@ -3321,7 +3324,6 @@ passages = [
  # load model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained('Salesforce/SFR-Embedding-Mistral')
  model = AutoModel.from_pretrained('Salesforce/SFR-Embedding-Mistral')
- tokenizer.add_eos_token = True

  # get the embeddings
  max_length = 4096
@@ -3334,6 +3336,35 @@ embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_ma
  embeddings = F.normalize(embeddings, p=2, dim=1)
  scores = (embeddings[:2] @ embeddings[2:].T) * 100
  print(scores.tolist())
+ # [[86.7153549194336, 36.64569091796875], [35.00493621826172, 82.0738525390625]]
+ ```
+
+ ### Sentence Transformers
+ ```python
+
+ from sentence_transformers import SentenceTransformer, util
+
+ model = SentenceTransformer("Salesforce/SFR-Embedding-Mistral")
+
+ def get_detailed_instruct(task_description: str, query: str) -> str:
+     return f'Instruct: {task_description}\nQuery: {query}'
+
+ # Each query must come with a one-sentence instruction that describes the task
+ task = 'Given a web search query, retrieve relevant passages that answer the query'
+ queries = [
+     get_detailed_instruct(task, 'How to bake a chocolate cake'),
+     get_detailed_instruct(task, 'Symptoms of the flu')
+ ]
+ # No need to add instruction for retrieval documents
+ passages = [
+     "To bake a delicious chocolate cake, you'll need the following ingredients: all-purpose flour, sugar, cocoa powder, baking powder, baking soda, salt, eggs, milk, vegetable oil, and vanilla extract. Start by preheating your oven to 350°F (175°C). In a mixing bowl, combine the dry ingredients (flour, sugar, cocoa powder, baking powder, baking soda, and salt). In a separate bowl, whisk together the wet ingredients (eggs, milk, vegetable oil, and vanilla extract). Gradually add the wet mixture to the dry ingredients, stirring until well combined. Pour the batter into a greased cake pan and bake for 30-35 minutes. Let it cool before frosting with your favorite chocolate frosting. Enjoy your homemade chocolate cake!",
+     "The flu, or influenza, is an illness caused by influenza viruses. Common symptoms of the flu include a high fever, chills, cough, sore throat, runny or stuffy nose, body aches, headache, fatigue, and sometimes nausea and vomiting. These symptoms can come on suddenly and are usually more severe than the common cold. It's important to get plenty of rest, stay hydrated, and consult a healthcare professional if you suspect you have the flu. In some cases, antiviral medications can help alleviate symptoms and reduce the duration of the illness."
+ ]
+
+ embeddings = model.encode(queries + passages)
+ scores = util.cos_sim(embeddings[:2], embeddings[2:]) * 100
+ print(scores.tolist())
+ # [[86.71537780761719, 36.645721435546875], [35.00497055053711, 82.07388305664062]]
  ```

  Code for MTEB evaluation will be added soon.
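Two details of this change are easy to miss: the manual `tokenizer.add_eos_token = True` line is dropped because the flag now ships in `tokenizer_config.json` (see the change at the end of this commit), and the two snippets print near-identical rather than bit-identical scores, which is consistent with ordinary floating-point variation between the two inference paths.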
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "__version__": {
+     "sentence_transformers": "2.2.2",
+     "transformers": "4.37.2",
+     "pytorch": "2.1.0+cu121"
+   }
+ }
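This file records only provenance: the `sentence_transformers`, `transformers`, and PyTorch versions the export was created with. It documents compatibility rather than changing model behavior.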
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   }
+ ]
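`modules.json` wires the two-stage pipeline: module 0 is the Mistral backbone at the repo root, module 1 is the last-token `Pooling` configured in `1_Pooling/`. Loading the repo by name assembles this automatically; here is a minimal sketch of the equivalent manual construction, assuming the standard `sentence_transformers.models` API:

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Pooling, Transformer

# Module 0 ("path": ""): the transformer backbone at the repository root.
backbone = Transformer('Salesforce/SFR-Embedding-Mistral', max_seq_length=4096)

# Module 1 ("path": "1_Pooling"): last-token pooling over the hidden states.
pooling = Pooling(backbone.get_word_embedding_dimension(), pooling_mode='lasttoken')

model = SentenceTransformer(modules=[backbone, pooling])
```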
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 4096,
+   "do_lower_case": false
+ }
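`max_seq_length` here sets the default truncation length when the model is loaded through Sentence Transformers. It is exposed as a writable attribute, so it can be lowered at runtime to trade context for speed and memory; a small usage sketch:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('Salesforce/SFR-Embedding-Mistral')
print(model.max_seq_length)  # 4096, read from sentence_bert_config.json

# Inputs longer than the limit are truncated; lowering it reduces memory use.
model.max_seq_length = 2048
```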
tokenizer_config.json CHANGED
@@ -1,4 +1,5 @@
  {
+   "add_eos_token": true,
    "added_tokens_decoder": {
      "0": {
        "content": "<unk>",