Shitao committed on
Commit
51d05f5
1 Parent(s): 3490d29

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +17 -1
README.md CHANGED
@@ -48,9 +48,19 @@ Then you can use the model like this:
48
  ```python
49
  from sentence_transformers import SentenceTransformer
50
  sentences = ["样例数据-1", "样例数据-2"]
51
- model = SentenceTransformer('BAAI/baai-general-embedding-large-en-instruction')
52
  embeddings = model.encode(sentences, normalize_embeddings=True)
53
  print(embeddings)
 
 
 
 
 
 
 
 
 
 
54
  ```
55
 
56
 
@@ -62,16 +72,22 @@ from transformers import AutoTokenizer, AutoModel
62
  import torch
63
  # Sentences we want sentence embeddings for
64
  sentences = ["样例数据-1", "样例数据-2"]
 
65
  # Load model from HuggingFace Hub
66
  tokenizer = AutoTokenizer.from_pretrained('BAAI/baai-general-embedding-large-en-instruction')
67
  model = AutoModel.from_pretrained('BAAI/baai-general-embedding-large-en-instruction')
 
68
  # Tokenize sentences
69
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
 
 
 
70
  # Compute token embeddings
71
  with torch.no_grad():
72
  model_output = model(**encoded_input)
73
  # Perform pooling. In this case, cls pooling.
74
  sentence_embeddings = model_output[0][:, 0]
 
75
  # normalize embeddings
76
  sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
77
  print("Sentence embeddings:")
 
48
  ```python
49
  from sentence_transformers import SentenceTransformer
50
  sentences = ["样例数据-1", "样例数据-2"]
51
+ model = SentenceTransformer('BAAI/baai-general-embedding-large-zh-instruction')
52
  embeddings = model.encode(sentences, normalize_embeddings=True)
53
  print(embeddings)
54
+
55
+ # For retrieval tasks, when you use a model whose name ends with `-instruction`,
56
+ # each query should start with an instruction.
57
+ queries = ["手机开不了机怎么办?"]
58
+ passages = ["样例段落-1", "样例段落-2"]
59
+ instruction = "为这个句子生成表示以用于检索相关文章:"
60
+ model = SentenceTransformer('BAAI/baai-general-embedding-large-zh-instruction')
61
+ q_embeddings = model.encode([instruction+q for q in queries], normalize_embeddings=True)
62
+ p_embeddings = model.encode(passages, normalize_embeddings=True)
63
+ scores = q_embeddings @ p_embeddings.T
64
  ```
65
 
66
 
 
72
  import torch
73
  # Sentences we want sentence embeddings for
74
  sentences = ["样例数据-1", "样例数据-2"]
75
+
76
  # Load model from HuggingFace Hub
77
  tokenizer = AutoTokenizer.from_pretrained('BAAI/baai-general-embedding-large-en-instruction')
78
  model = AutoModel.from_pretrained('BAAI/baai-general-embedding-large-en-instruction')
79
+
80
  # Tokenize sentences
81
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
82
+ # For retrieval tasks, add an instruction to each query when using a "*-instruction" model.
83
+ # encoded_input = tokenizer(["为这个句子生成表示以用于检索相关文章:" + query for query in queries], padding=True, truncation=True, return_tensors='pt')
84
+
85
  # Compute token embeddings
86
  with torch.no_grad():
87
  model_output = model(**encoded_input)
88
  # Perform pooling. In this case, cls pooling.
89
  sentence_embeddings = model_output[0][:, 0]
90
+
91
  # normalize embeddings
92
  sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
93
  print("Sentence embeddings:")