Update README.md
README.md (CHANGED)
````diff
@@ -133,6 +133,20 @@ import torch
 import torch.nn.functional as F
 from transformers import AutoModel, AutoTokenizer
 
+
+def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    """Average pooling with attention mask."""
+
+    last_hidden_states_masked = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
+    embedding = last_hidden_states_masked.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+    embedding = F.normalize(embedding, dim=-1)
+
+    return embedding
+
+# Define task and queries
+def get_instruction(task_instruction: str, query: str) -> str:
+    return f"Instruct: {task_instruction}\nQuery: {query}"
+
 model_name_or_path = "nvidia/llama-embed-nemotron-8b"
 
 attn_implementation = "flash_attention_2" if torch.cuda.is_available() else "eager"
````
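For context, here is a minimal standalone sketch of what the newly added `average_pool` helper computes: a mean over the token dimension that ignores padded positions, followed by L2 normalization. The toy tensors and shapes below are invented for illustration and are not part of the README.

```python
import torch
import torch.nn.functional as F

def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average pooling with attention mask (as added in this change)."""
    last_hidden_states_masked = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    embedding = last_hidden_states_masked.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    embedding = F.normalize(embedding, dim=-1)
    return embedding

# Toy batch: 2 sequences, 3 token positions, hidden size 4 (made-up values).
hidden = torch.arange(24, dtype=torch.float32).reshape(2, 3, 4)
mask = torch.tensor([[1, 1, 0],   # last position of the first sequence is padding
                     [1, 1, 1]])

pooled = average_pool(hidden, mask)
print(pooled.shape)         # torch.Size([2, 4])
print(pooled.norm(dim=-1))  # ~tensor([1., 1.]) -- rows are unit length

# The padded position is excluded: row 0 equals the normalized mean
# of only the first two token vectors.
manual = F.normalize(hidden[0, :2].mean(dim=0, keepdim=True), dim=-1)
print(torch.allclose(pooled[:1], manual))  # True
```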
````diff
@@ -153,10 +167,6 @@ model = AutoModel.from_pretrained(
 ).eval()
 model = model.to("cuda:0" if torch.cuda.is_available() else "cpu")
 
-# Define task and queries
-def get_instruction(task_instruction: str, query: str) -> str:
-    return f"Instruct: {task_instruction}\nQuery: {query}"
-
 # Model is instruction-aware, which requires each query to have a short instruction with the task instruction
 task = "Given a question, retrieve passages that answer the question"
 queries = [
````
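This second hunk only relocates the instruction helper earlier in the snippet; the format it produces is unchanged. As a quick illustration (the query string below is an arbitrary placeholder, not taken from the README):

```python
def get_instruction(task_instruction: str, query: str) -> str:
    return f"Instruct: {task_instruction}\nQuery: {query}"

task = "Given a question, retrieve passages that answer the question"
print(get_instruction(task, "how do transformer embedding models work"))
# Instruct: Given a question, retrieve passages that answer the question
# Query: how do transformer embedding models work
```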
````diff
@@ -181,17 +191,15 @@ batch_dict = tokenizer(
 attention_mask = batch_dict["attention_mask"]
 
 # Forward pass
-
+model_outputs = model(**batch_dict)
 
 # Average pooling
-embeddings = last_hidden_state
+embeddings = average_pool(model_outputs.last_hidden_state, attention_mask)
 
-# Embedding normalization
-embeddings = F.normalize(embeddings, dim=-1)
 scores = (embeddings[:1] @ embeddings[1:].T)
 
 print(scores.tolist())
-# [[0.
+# [[0.37646484375, 0.0579833984375]]
 ```
 
 ## Software Integration:
````
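Because `average_pool` already L2-normalizes its output, the final `embeddings[:1] @ embeddings[1:].T` in the updated example is a cosine similarity between the query embedding and each document embedding. A small self-contained check with random stand-in tensors (not model outputs):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
# Stand-in for 1 query embedding followed by 2 document embeddings.
embeddings = F.normalize(torch.randn(3, 8), dim=-1)

scores = embeddings[:1] @ embeddings[1:].T                            # what the README computes
cosine = F.cosine_similarity(embeddings[:1], embeddings[1:], dim=-1)  # explicit cosine similarity

print(torch.allclose(scores.squeeze(0), cosine, atol=1e-6))  # True
```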