Kaguya-19
committed on
Commit
•
1023e8b
1
Parent(s):
5d2fd73
fit for Sentence Transformer
Browse files- README.md +92 -23
- config.json +1 -1
README.md
CHANGED
@@ -85,40 +85,53 @@ flash-attn>2.3.5
|
|
85 |
|
86 |
### 示例脚本 Demo
|
87 |
|
|
|
|
|
88 |
```python
|
89 |
-
from transformers import AutoModel,
|
90 |
import torch
|
91 |
import numpy as np
|
92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
model_name = "openbmb/MiniCPM-Reranker"
|
94 |
-
tokenizer =
|
95 |
tokenizer.padding_side = "right"
|
|
|
96 |
model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True,attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
|
97 |
model.eval()
|
98 |
-
max_len_q, max_len_d = 512, 512
|
99 |
-
|
100 |
-
def tokenize_our(query,doc):
|
101 |
-
input_id_query = tokenizer.encode(query, add_special_tokens=False, max_length=max_len_q, truncation=True)
|
102 |
-
input_id_doc = tokenizer.encode(doc, add_special_tokens=False, max_length=max_len_d, truncation=True)
|
103 |
-
pad_input = {"input_ids": [tokenizer.bos_token_id] + input_id_query + [tokenizer.eos_token_id] + input_id_doc}
|
104 |
-
return tokenizer.pad(
|
105 |
-
pad_input,
|
106 |
-
padding="max_length",
|
107 |
-
max_length=max_len_q + max_len_d + 2,
|
108 |
-
return_tensors="pt",
|
109 |
-
)
|
110 |
|
111 |
@torch.no_grad()
|
112 |
def rerank(input_query, input_docs):
|
113 |
-
tokenized_inputs = [
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
for k in input_ids:
|
120 |
-
input_ids[k] = torch.stack(input_ids[k]).to("cuda")
|
121 |
-
outputs = model(**input_ids)
|
122 |
score = outputs.logits
|
123 |
return score.float().detach().cpu().numpy()
|
124 |
|
@@ -136,6 +149,62 @@ for i in range(len(queries)):
|
|
136 |
print(np.array(scores)) # [[[-4.7460938][-8.8515625]]]
|
137 |
```
|
138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
## 实验结果 Evaluation Results
|
140 |
|
141 |
### 中文与英文重排序结果 CN/EN Re-ranking Results
|
|
|
85 |
|
86 |
### 示例脚本 Demo
|
87 |
|
88 |
+
#### Huggingface Transformers
|
89 |
+
|
90 |
```python
|
91 |
+
from transformers import AutoModel, LlamaTokenizer, AutoModelForSequenceClassification
|
92 |
import torch
|
93 |
import numpy as np
|
94 |
|
95 |
+
# Adapted from build_inputs_with_special_tokens in https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
|
96 |
+
class MiniCPMRerankerLLamaTokenizer(LlamaTokenizer):
|
97 |
+
def build_inputs_with_special_tokens(
|
98 |
+
self, token_ids_0, token_ids_1 = None
|
99 |
+
):
|
100 |
+
"""
|
101 |
+
- single sequence: `<s> X </s>`
|
102 |
+
- pair of sequences: `<s> A </s> B`
|
103 |
+
|
104 |
+
Args:
|
105 |
+
token_ids_0 (`List[int]`):
|
106 |
+
List of IDs to which the special tokens will be added.
|
107 |
+
token_ids_1 (`List[int]`, *optional*):
|
108 |
+
Optional second list of IDs for sequence pairs.
|
109 |
+
|
110 |
+
Returns:
|
111 |
+
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
|
112 |
+
"""
|
113 |
+
|
114 |
+
if token_ids_1 is None:
|
115 |
+
return super().build_inputs_with_special_tokens(token_ids_0)
|
116 |
+
bos = [self.bos_token_id]
|
117 |
+
sep = [self.eos_token_id]
|
118 |
+
return bos + token_ids_0 + sep + token_ids_1
|
119 |
+
|
120 |
model_name = "openbmb/MiniCPM-Reranker"
|
121 |
+
tokenizer = MiniCPMRerankerLLamaTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
122 |
tokenizer.padding_side = "right"
|
123 |
+
|
124 |
model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True,attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
|
125 |
model.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
|
127 |
@torch.no_grad()
|
128 |
def rerank(input_query, input_docs):
|
129 |
+
tokenized_inputs = tokenizer([[input_query, input_doc] for input_doc in input_docs], return_tensors="pt", padding=True, truncation=True, max_length=1024)
|
130 |
+
|
131 |
+
for k in tokenized_inputs:
|
132 |
+
tokenized_inputs [k] = tokenized_inputs[k].to("cuda")
|
133 |
+
|
134 |
+
outputs = model(**tokenized_inputs)
|
|
|
|
|
|
|
135 |
score = outputs.logits
|
136 |
return score.float().detach().cpu().numpy()
|
137 |
|
|
|
149 |
print(np.array(scores)) # [[[-4.7460938][-8.8515625]]]
|
150 |
```
|
151 |
|
152 |
+
#### Sentence Transformer
|
153 |
+
|
154 |
+
```python
|
155 |
+
from sentence_transformers import CrossEncoder
|
156 |
+
from transformers import LlamaTokenizer
|
157 |
+
import torch
|
158 |
+
|
159 |
+
# Adapted from build_inputs_with_special_tokens in https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
|
160 |
+
class MiniCPMRerankerLLamaTokenizer(LlamaTokenizer):
|
161 |
+
def build_inputs_with_special_tokens(
|
162 |
+
self, token_ids_0, token_ids_1 = None
|
163 |
+
):
|
164 |
+
"""
|
165 |
+
- single sequence: `<s> X </s>`
|
166 |
+
- pair of sequences: `<s> A </s> B`
|
167 |
+
|
168 |
+
Args:
|
169 |
+
token_ids_0 (`List[int]`):
|
170 |
+
List of IDs to which the special tokens will be added.
|
171 |
+
token_ids_1 (`List[int]`, *optional*):
|
172 |
+
Optional second list of IDs for sequence pairs.
|
173 |
+
|
174 |
+
Returns:
|
175 |
+
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
|
176 |
+
"""
|
177 |
+
|
178 |
+
if token_ids_1 is None:
|
179 |
+
return super().build_inputs_with_special_tokens(token_ids_0)
|
180 |
+
bos = [self.bos_token_id]
|
181 |
+
sep = [self.eos_token_id]
|
182 |
+
return bos + token_ids_0 + sep + token_ids_1
|
183 |
+
|
184 |
+
model_name = "openbmb/MiniCPM-Reranker"
|
185 |
+
model = CrossEncoder(model_name,max_length=1024,trust_remote_code=True, automodel_args={"attn_implementation":"flash_attention_2","torch_dtype": torch.float16})
|
186 |
+
model.tokenizer = MiniCPMRerankerLLamaTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
187 |
+
model.tokenizer.padding_side = "right"
|
188 |
+
|
189 |
+
query = "中国的首都是哪里?"
|
190 |
+
passages = [["beijing", "shanghai"]]
|
191 |
+
|
192 |
+
INSTRUCTION = "Query: "
|
193 |
+
query = INSTRUCTION + query
|
194 |
+
|
195 |
+
sentence_pairs = [[query, doc] for doc in passages]
|
196 |
+
|
197 |
+
scores = model.predict(sentence_pairs, convert_to_tensor=True).tolist()
|
198 |
+
rankings = model.rank(query, passages, return_documents=True, convert_to_tensor=True)
|
199 |
+
|
200 |
+
print(scores) # [0.0087432861328125, 0.00020503997802734375]
|
201 |
+
for ranking in rankings:
|
202 |
+
print(f"Score: {ranking['score']:.4f}, Corpus: {ranking['text']}")
|
203 |
+
|
204 |
+
# Score: 0.0087, Corpus: beijing
|
205 |
+
# Score: 0.0002, Corpus: shanghai
|
206 |
+
```
|
207 |
+
|
208 |
## 实验结果 Evaluation Results
|
209 |
|
210 |
### 中文与英文重排序结果 CN/EN Re-ranking Results
|
config.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"_name_or_path": "openbmb/
|
3 |
"architectures": [
|
4 |
"MiniCPM"
|
5 |
],
|
|
|
1 |
{
|
2 |
+
"_name_or_path": "openbmb/MiniCPM-Reranker",
|
3 |
"architectures": [
|
4 |
"MiniCPM"
|
5 |
],
|