Zhichao Geng committed • Commit b349e79 • 1 Parent(s): a43bd9e
Update README.md

README.md CHANGED
@@ -21,18 +21,20 @@ import itertools
 import torch
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 
+
 # get sparse vector from dense vectors with shape batch_size * seq_len * vocab_size
 def get_sparse_vector(feature, output):
     values, _ = torch.max(output*feature["attention_mask"].unsqueeze(-1), dim=1)
     values = torch.log(1 + torch.relu(values))
+    values[:,special_token_ids] = 0
     return values
 
 # transform the sparse vector to a dict of (token, weight)
-def transform_sparse_vector_to_dict(sparse_vector, id_to_token):
+def transform_sparse_vector_to_dict(sparse_vector):
     sample_indices,token_indices=torch.nonzero(sparse_vector,as_tuple=True)
     non_zero_values = sparse_vector[(sample_indices,token_indices)].tolist()
     number_of_tokens_for_each_sample = torch.bincount(sample_indices).cpu().tolist()
-    tokens = [id_to_token[_id] for _id in token_indices.tolist()]
+    tokens = [transform_sparse_vector_to_dict.id_to_token[_id] for _id in token_indices.tolist()]
 
     output = []
     end_idxs = list(itertools.accumulate([0]+number_of_tokens_for_each_sample))
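In the first hunk, `get_sparse_vector` max-pools the masked-LM logits over the sequence dimension (with padding positions zeroed by the attention mask) and squashes them through `log(1 + relu(...))`; the added line then zeroes the columns of special tokens so they can never contribute to a query-document score. A minimal sketch of that masking step, with a toy vocabulary and made-up special-token ids:

```python
import torch

# toy batch of 2 sparse vectors over a 10-token vocabulary
values = torch.rand(2, 10)

# pretend ids 0 and 2 are special tokens such as [PAD] and [CLS]
special_token_ids = [0, 2]

# zero those columns so special tokens never add to the dot-product score
values[:, special_token_ids] = 0
print(values[:, special_token_ids])  # tensor of zeros, shape (2, 2)
```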
@@ -47,6 +49,16 @@ def transform_sparse_vector_to_dict(sparse_vector, id_to_token):
 model = AutoModelForMaskedLM.from_pretrained("opensearch-project/opensearch-neural-sparse-encoding-v1")
 tokenizer = AutoTokenizer.from_pretrained("opensearch-project/opensearch-neural-sparse-encoding-v1")
 
+# set the special tokens and id_to_token transform for post-process
+special_token_ids = [tokenizer.vocab[token] for token in tokenizer.special_tokens_map.values()]
+get_sparse_vector.special_token_ids = special_token_ids
+id_to_token = ["" for i in range(tokenizer.vocab_size)]
+for token, _id in tokenizer.vocab.items():
+    id_to_token[_id] = token
+transform_sparse_vector_to_dict.id_to_token = id_to_token
+
+
+
 query = "What's the weather in ny now?"
 document = "Currently New York is rainy."
 
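The second hunk computes the tokenizer-dependent state once and hangs it off the helper functions as attributes, which is why `transform_sparse_vector_to_dict` no longer takes an `id_to_token` argument. (Note that the body of `get_sparse_vector` reads the module-level `special_token_ids` name directly, so the attribute set on that function is not what the lookup hits.) A minimal sketch of the function-attribute pattern, using a hypothetical `scale` helper:

```python
# state is computed once at setup time and attached to the function object,
# so callers do not have to thread it through every call
def scale(values):
    return [v * scale.factor for v in values]

scale.factor = 10  # set once, next to the rest of the setup code
print(scale([1, 2, 3]))  # [10, 20, 30]
```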
@@ -59,13 +71,8 @@ sparse_vector = get_sparse_vector(feature, output)
 sim_score = torch.matmul(sparse_vector[0],sparse_vector[1])
 print(sim_score)  # tensor(22.3299, grad_fn=<DotBackward0>)
 
-# get the array to transform token id to token string
-id_to_token = ["" for i in range(tokenizer.vocab_size)]
-for token, _id in tokenizer.vocab.items():
-    id_to_token[_id] = token
-
 
-query_token_weight, document_query_token_weight = transform_sparse_vector_to_dict(sparse_vector, id_to_token)
+query_token_weight, document_query_token_weight = transform_sparse_vector_to_dict(sparse_vector)
 for token in sorted(query_token_weight, key=lambda x:query_token_weight[x], reverse=True):
     if token in document_query_token_weight:
         print("score in query: %.4f, score in document: %.4f, token: %s"%(query_token_weight[token],document_query_token_weight[token],token))
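The third hunk simply updates the call site to the new signature. The loop that follows prints the per-token decomposition of `sim_score`: because both vectors are sparse, their dot product is exactly the sum of weight products over the tokens active on both sides. A toy illustration with made-up weights:

```python
# made-up (token, weight) dicts for a query and a document
query_vec = {"weather": 1.2, "ny": 0.9, "rain": 0.4}
doc_vec = {"rain": 1.5, "york": 1.1, "weather": 0.8}

# the dot product reduces to a sum over the shared tokens
score = sum(w * doc_vec[t] for t, w in query_vec.items() if t in doc_vec)
print(score)  # 1.2*0.8 + 0.4*1.5 ≈ 1.56
```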
@@ -94,7 +101,6 @@ for token in sorted(query_token_weight, key=lambda x:query_token_weight[x], reve
 # score in query: 0.1191, score in document: 0.1533, token: nature
 # score in query: 0.0665, score in document: 0.0600, token: temperature
 # score in query: 0.0552, score in document: 0.3396, token: windy
-
 ```
 
 The above code sample shows an example of neural sparse search. Although there are no overlapping tokens between the original query and the document, the model still produces a good match.
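For readers who want to inspect the expansion directly, here is a small usage sketch. It assumes the full README snippet has been run; the forward pass shown here (`model(**feature)[0]`) follows the usual Hugging Face pattern and is an assumption, since those lines sit outside the hunks above:

```python
# print the ten highest-weighted expansion terms for a single text
texts = ["What's the weather in ny now?"]
feature = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
output = model(**feature)[0]
sparse_vector = get_sparse_vector(feature, output)

token_weight = transform_sparse_vector_to_dict(sparse_vector)[0]
for token in sorted(token_weight, key=token_weight.get, reverse=True)[:10]:
    print("%.4f %s" % (token_weight[token], token))
```

Terms beyond the surface tokens of the text typically receive weight, which is how the query and document in the README can match without sharing any tokens.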