ybabakhin committed
Commit 8850046 · verified · 1 Parent(s): 8a80716

Update README.md

Files changed (1):
  1. README.md (+17 -9)
README.md CHANGED
@@ -133,6 +133,20 @@ import torch
 import torch.nn.functional as F
 from transformers import AutoModel, AutoTokenizer
 
+
+def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    """Average pooling with attention mask."""
+
+    last_hidden_states_masked = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
+    embedding = last_hidden_states_masked.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+    embedding = F.normalize(embedding, dim=-1)
+
+    return embedding
+
+# Define task and queries
+def get_instruction(task_instruction: str, query: str) -> str:
+    return f"Instruct: {task_instruction}\nQuery: {query}"
+
 model_name_or_path = "nvidia/llama-embed-nemotron-8b"
 
 attn_implementation = "flash_attention_2" if torch.cuda.is_available() else "eager"
@@ -153,10 +167,6 @@ model = AutoModel.from_pretrained(
 ).eval()
 model = model.to("cuda:0" if torch.cuda.is_available() else "cpu")
 
-# Define task and queries
-def get_instruction(task_instruction: str, query: str) -> str:
-    return f"Instruct: {task_instruction}\nQuery: {query}"
-
 # Model is instruction-aware, which requires each query to have a short instruction with the task instruction
 task = "Given a question, retrieve passages that answer the question"
 queries = [
@@ -181,17 +191,15 @@ batch_dict = tokenizer(
 attention_mask = batch_dict["attention_mask"]
 
 # Forward pass
-last_hidden_state = model(**batch_dict).last_hidden_state.to(torch.float32)
+model_outputs = model(**batch_dict)
 
 # Average pooling
-embeddings = last_hidden_state.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+embeddings = average_pool(model_outputs.last_hidden_state, attention_mask)
 
-# Embedding normalization
-embeddings = F.normalize(embeddings, dim=-1)
 scores = (embeddings[:1] @ embeddings[1:].T)
 
 print(scores.tolist())
-# [[0.46564924716949463, 0.05839264765381813]]
+# [[0.37646484375, 0.0579833984375]]
 ```
 
 ## Software Integration:
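The substance of the change is the switch from unmasked averaging, where hidden states at padding positions were summed in before dividing by the count of real tokens, to mask-aware pooling via the new `average_pool` helper, which also folds in the L2 normalization. The snippet below is a minimal sketch of that difference on toy tensors; only the body of `average_pool` is taken from the diff, while the tensor values, shapes, and the `old`/`new` variable names are illustrative assumptions.

```python
import torch
import torch.nn.functional as F


def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average pooling with attention mask (helper added in this commit)."""
    last_hidden_states_masked = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    embedding = last_hidden_states_masked.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    return F.normalize(embedding, dim=-1)


# Illustrative tensors (not model outputs): 2 sequences, 3 positions, hidden size 2.
# The last position of the second sequence is padding with a garbage hidden state.
hidden = torch.tensor(
    [[[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
     [[2.0, 0.0], [0.0, 2.0], [9.0, 0.0]]]
)
mask = torch.tensor([[1, 1, 1],
                     [1, 1, 0]])

# Previous README logic: sum over every position (padding included),
# divide by the number of real tokens, then normalize.
old = F.normalize(hidden.sum(dim=1) / mask.sum(dim=1)[..., None], dim=-1)

# New logic: zero out padding positions before summing.
new = average_pool(hidden, mask)

print(old[1].tolist())  # padding leaks into the embedding
print(new[1].tolist())  # only the two real tokens contribute
```

This is consistent with the updated expected scores in the README example; the removal of the explicit float32 cast on the last hidden state may also contribute to the new printed values.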