prithivida
/

Splade_PP_en_v1

document-expansion

sparse representation

passage-retrieval

knowledge-distillation

document encoder

Inference Endpoints

Model card · Files and versions

prithivida committed on Feb 16

Commit

20eab3a

•

1 Parent(s): 3fe3e6c

Update torch code

Files changed (1) hide show

README.md +12 -10

README.md CHANGED Viewed

@@ -140,17 +140,20 @@ sparse_rep = expander.expand(
 import torch
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained('prithivida/Splade_PP_en_v1')
 model = AutoModelForMaskedLM.from_pretrained('prithivida/Splade_PP_en_v1')
 sentence = """The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science."""
 inputs = tokenizer(sentence, return_tensors='pt')
 input_ids = inputs['input_ids']
 attention_mask = inputs['attention_mask']
 outputs = model(**inputs)
-print(outputs.logits.shape)
 logits, attention_mask = outputs.logits, attention_mask
 relu_log = torch.log(1 + torch.relu(logits))
@@ -158,19 +161,18 @@ weighted_log = relu_log * attention_mask.unsqueeze(-1)
 max_val, _ = torch.max(weighted_log, dim=1)
 vector = max_val.squeeze()
 cols = vector.nonzero().squeeze().cpu().tolist()
 weights = vector[cols].cpu().tolist()
-# Map indices to tokens and create a dictionary
-idx2token = {idx: token for token, idx in tokenizer.get_vocab().items()}
-token_weight_dict = {
-    idx2token[idx]: round(weight, 2) for idx, weight in zip(cols, weights)
-}
-# Sort the dictionary by weights in descending order
-sorted_token_weight_dict = {k: v for k, v in sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True) if v > 0}
-print(sorted_token_weight_dict)
 ```
 ## BEIR Zeroshot OOD performance:

 import torch
 from transformers import AutoModelForMaskedLM, AutoTokenizer
+# Run on the first CUDA device when one is available, otherwise on the CPU.
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
 tokenizer = AutoTokenizer.from_pretrained('prithivida/Splade_PP_en_v1')
+# Vocabulary id -> token string. Used at the end to turn the non-zero
+# dimensions of the sparse vector back into readable tokens.
+reverse_voc = {idx: token for token, idx in tokenizer.get_vocab().items()}
 model = AutoModelForMaskedLM.from_pretrained('prithivida/Splade_PP_en_v1')
+model.to(device)
 sentence = """The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science."""
 inputs = tokenizer(sentence, return_tensors='pt')
+# The input tensors must live on the same device as the model.
+inputs = {key: val.to(device) for key, val in inputs.items()}
 input_ids = inputs['input_ids']
 attention_mask = inputs['attention_mask']
 outputs = model(**inputs)
 logits, attention_mask = outputs.logits, attention_mask
 relu_log = torch.log(1 + torch.relu(logits))
 # Multiply by the attention mask so positions whose mask value is 0
 # contribute nothing to the max below.
 weighted_log = relu_log * attention_mask.unsqueeze(-1)
 # Max over dim 1 (assumes logits are (batch, seq_len, vocab) -- TODO confirm).
 max_val, _ = torch.max(weighted_log, dim=1)
 vector = max_val.squeeze()
 cols = vector.nonzero().squeeze().cpu().tolist()
+print("number of actual dimensions: ", len(cols))
 weights = vector[cols].cpu().tolist()
+d = {k: v for k, v in zip(cols, weights)}
+# Highest-weighted vocabulary ids first.
+sorted_d = {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}
+bow_rep = [(reverse_voc[k], round(v, 2)) for k, v in sorted_d.items()]
+print("SPLADE BOW rep:\n", bow_rep)
 ```
 ## BEIR Zeroshot OOD performance: