Update README.md
README.md (CHANGED)
````diff
@@ -133,6 +133,20 @@ import torch
 import torch.nn.functional as F
 from transformers import AutoModel, AutoTokenizer
 
+
+def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    """Average pooling with attention mask."""
+
+    last_hidden_states_masked = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
+    embedding = last_hidden_states_masked.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+    embedding = F.normalize(embedding, dim=-1)
+
+    return embedding
+
+# Define task and queries
+def get_instruction(task_instruction: str, query: str) -> str:
+    return f"Instruct: {task_instruction}\nQuery: {query}"
+
 model_name_or_path = "nvidia/llama-embed-nemotron-8b"
 
 attn_implementation = "flash_attention_2" if torch.cuda.is_available() else "eager"
````
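For context, here is a minimal standalone sketch of what the newly added `average_pool` helper computes: a mean over the token dimension that ignores padded positions, followed by L2 normalization. The toy tensors and shapes below are invented for illustration and are not part of the README.

```python
import torch
import torch.nn.functional as F

def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average pooling with attention mask (as added in this change)."""
    last_hidden_states_masked = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    embedding = last_hidden_states_masked.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    embedding = F.normalize(embedding, dim=-1)
    return embedding

# Toy batch: 2 sequences, 3 token positions, hidden size 4 (made-up values).
hidden = torch.arange(24, dtype=torch.float32).reshape(2, 3, 4)
mask = torch.tensor([[1, 1, 0],   # last position of the first sequence is padding
                     [1, 1, 1]])

pooled = average_pool(hidden, mask)
print(pooled.shape)         # torch.Size([2, 4])
print(pooled.norm(dim=-1))  # ~tensor([1., 1.]) -- rows are unit length

# The padded position is excluded: row 0 equals the normalized mean
# of only the first two token vectors.
manual = F.normalize(hidden[0, :2].mean(dim=0, keepdim=True), dim=-1)
print(torch.allclose(pooled[:1], manual))  # True
```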
````diff
@@ -153,10 +167,6 @@ model = AutoModel.from_pretrained(
 ).eval()
 model = model.to("cuda:0" if torch.cuda.is_available() else "cpu")
 
-# Define task and queries
-def get_instruction(task_instruction: str, query: str) -> str:
-    return f"Instruct: {task_instruction}\nQuery: {query}"
-
 # Model is instruction-aware, which requires each query to have a short instruction with the task instruction
 task = "Given a question, retrieve passages that answer the question"
 queries = [
````
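This second hunk only relocates the instruction helper earlier in the snippet; the format it produces is unchanged. As a quick illustration (the query string below is an arbitrary placeholder, not taken from the README):

```python
def get_instruction(task_instruction: str, query: str) -> str:
    return f"Instruct: {task_instruction}\nQuery: {query}"

task = "Given a question, retrieve passages that answer the question"
print(get_instruction(task, "how do transformer embedding models work"))
# Instruct: Given a question, retrieve passages that answer the question
# Query: how do transformer embedding models work
```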
````diff
@@ -181,17 +191,15 @@ batch_dict = tokenizer(
 attention_mask = batch_dict["attention_mask"]
 
 # Forward pass
-
+model_outputs = model(**batch_dict)
 
 # Average pooling
-embeddings = last_hidden_state
+embeddings = average_pool(model_outputs.last_hidden_state, attention_mask)
 
-# Embedding normalization
-embeddings = F.normalize(embeddings, dim=-1)
 scores = (embeddings[:1] @ embeddings[1:].T)
 
 print(scores.tolist())
-# [[0.
+# [[0.37646484375, 0.0579833984375]]
 ```
 
 ## Software Integration:
````
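Because `average_pool` already L2-normalizes its output, the final `embeddings[:1] @ embeddings[1:].T` in the updated example is a cosine similarity between the query embedding and each document embedding. A small self-contained check with random stand-in tensors (not model outputs):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
# Stand-in for 1 query embedding followed by 2 document embeddings.
embeddings = F.normalize(torch.randn(3, 8), dim=-1)

scores = embeddings[:1] @ embeddings[1:].T                            # what the README computes
cosine = F.cosine_similarity(embeddings[:1], embeddings[1:], dim=-1)  # explicit cosine similarity

print(torch.allclose(scores.squeeze(0), cosine, atol=1e-6))  # True
```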