BAAI
/

bge-large-en

@@ -48,9 +48,19 @@ Then you can use the model like this:
 ```python
 from sentence_transformers import SentenceTransformer
 sentences = ["样例数据-1", "样例数据-2"]
-model = SentenceTransformer('BAAI/baai-general-embedding-large-en-instruction')
 embeddings = model.encode(sentences, normalize_embeddings=True)
 print(embeddings)
 ```
@@ -62,16 +72,22 @@ from transformers import AutoTokenizer, AutoModel
 import torch
 # Sentences we want sentence embeddings for
 sentences = ["样例数据-1", "样例数据-2"]
 # Load model from HuggingFace Hub
 tokenizer = AutoTokenizer.from_pretrained('BAAI/baai-general-embedding-large-en-instruction')
 model = AutoModel.from_pretrained('BAAI/baai-general-embedding-large-en-instruction')
 # Tokenize sentences
 encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
 # Compute token embeddings
 with torch.no_grad():
     model_output = model(**encoded_input)
     # Perform pooling. In this case, cls pooling.
     sentence_embeddings = model_output[0][:, 0]
 # normalize embeddings
 sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
 print("Sentence embeddings:")

 ```python
 from sentence_transformers import SentenceTransformer
 sentences = ["样例数据-1", "样例数据-2"]
+model = SentenceTransformer('BAAI/baai-general-embedding-large-zh-instruction')
 embeddings = model.encode(sentences, normalize_embeddings=True)
 print(embeddings)
+# For retrieval tasks, when you use a model whose name ends with `-instruction`,
+# each query should start with an instruction.
+queries = ["手机开不了机怎么办？"]
+passages = ["样例段落-1", "样例段落-2"]
+instruction = "为这个句子生成表示以用于检索相关文章："
+model = SentenceTransformer('BAAI/baai-general-embedding-large-zh-instruction')
+q_embeddings = model.encode([instruction+q for q in queries], normalize_embeddings=True)
+p_embeddings = model.encode(passages, normalize_embeddings=True)
+scores = q_embeddings @ p_embeddings.T
 ```
 import torch
 # Sentences we want sentence embeddings for
 sentences = ["样例数据-1", "样例数据-2"]
 # Load model from HuggingFace Hub
 tokenizer = AutoTokenizer.from_pretrained('BAAI/baai-general-embedding-large-en-instruction')
 model = AutoModel.from_pretrained('BAAI/baai-general-embedding-large-en-instruction')
 # Tokenize sentences
 encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+# For retrieval tasks, add an instruction to each query when using a "*-instruction" model.
+# encoded_input = tokenizer(["为这个句子生成表示以用于检索相关文章：" + query for query in queries], padding=True, truncation=True, return_tensors='pt')
 # Compute token embeddings
 with torch.no_grad():
     model_output = model(**encoded_input)
     # Perform pooling. In this case, cls pooling.
     sentence_embeddings = model_output[0][:, 0]
 # normalize embeddings
 sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
 print("Sentence embeddings:")