Zhichao Geng committed
Commit b349e79
Parent: a43bd9e

Update README.md

Files changed (1): README.md (+15 -9)
README.md CHANGED
@@ -21,18 +21,20 @@ import itertools
 import torch
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 
+
 # get sparse vector from dense vectors with shape batch_size * seq_len * vocab_size
 def get_sparse_vector(feature, output):
     values, _ = torch.max(output*feature["attention_mask"].unsqueeze(-1), dim=1)
     values = torch.log(1 + torch.relu(values))
+    values[:,special_token_ids] = 0
     return values
 
 # transform the sparse vector to a dict of (token, weight)
-def transform_sparse_vector_to_dict(sparse_vector, id_to_token):
+def transform_sparse_vector_to_dict(sparse_vector):
     sample_indices,token_indices=torch.nonzero(sparse_vector,as_tuple=True)
     non_zero_values = sparse_vector[(sample_indices,token_indices)].tolist()
     number_of_tokens_for_each_sample = torch.bincount(sample_indices).cpu().tolist()
-    tokens = [id_to_token[_id] for _id in token_indices.tolist()]
+    tokens = [transform_sparse_vector_to_dict.id_to_token[_id] for _id in token_indices.tolist()]
 
     output = []
     end_idxs = list(itertools.accumulate([0]+number_of_tokens_for_each_sample))
@@ -47,6 +49,16 @@ def transform_sparse_vector_to_dict(sparse_vector, id_to_token):
 model = AutoModelForMaskedLM.from_pretrained("opensearch-project/opensearch-neural-sparse-encoding-v1")
 tokenizer = AutoTokenizer.from_pretrained("opensearch-project/opensearch-neural-sparse-encoding-v1")
 
+# set the special tokens and id_to_token transform for post-process
+special_token_ids = [tokenizer.vocab[token] for token in tokenizer.special_tokens_map.values()]
+get_sparse_vector.special_token_ids = special_token_ids
+id_to_token = ["" for i in range(tokenizer.vocab_size)]
+for token, _id in tokenizer.vocab.items():
+    id_to_token[_id] = token
+transform_sparse_vector_to_dict.id_to_token = id_to_token
+
+
+
 query = "What's the weather in ny now?"
 document = "Currently New York is rainy."
 
@@ -59,13 +71,8 @@ sparse_vector = get_sparse_vector(feature, output)
 sim_score = torch.matmul(sparse_vector[0],sparse_vector[1])
 print(sim_score) # tensor(22.3299, grad_fn=<DotBackward0>)
 
-# get the array to transform token id to token string
-id_to_token = ["" for i in range(tokenizer.vocab_size)]
-for token, _id in tokenizer.vocab.items():
-    id_to_token[_id] = token
-
 
-query_token_weight, document_query_token_weight = transform_sparse_vector_to_dict(sparse_vector, id_to_token)
+query_token_weight, document_query_token_weight = transform_sparse_vector_to_dict(sparse_vector)
 for token in sorted(query_token_weight, key=lambda x:query_token_weight[x], reverse=True):
     if token in document_query_token_weight:
         print("score in query: %.4f, score in document: %.4f, token: %s"%(query_token_weight[token],document_query_token_weight[token],token))
@@ -94,7 +101,6 @@ for token in sorted(query_token_weight, key=lambda x:query_token_weight[x], reve
 # score in query: 0.1191, score in document: 0.1533, token: nature
 # score in query: 0.0665, score in document: 0.0600, token: temperature
 # score in query: 0.0552, score in document: 0.3396, token: windy
-
 ```
 
 The above code sample shows an example of neural sparse search: although the original query and document share no overlapping tokens, the model still produces a good match.
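The main functional change in this commit is that `get_sparse_vector` now zeroes the vocabulary positions of special tokens (for a BERT-style tokenizer, entries such as `[CLS]`, `[SEP]`, and `[PAD]`) so they cannot contribute to the relevance score. Below is a minimal sketch of what the function computes; the toy shapes and special-token ids are made up for illustration and are not the model's real outputs:

```python
import torch

# toy MLM logits: batch of 2 sequences, seq_len 3, "vocabulary" of 5 tokens
output = torch.rand(2, 3, 5)
attention_mask = torch.tensor([[1, 1, 1],
                               [1, 1, 0]])  # last position of sample 2 is padding

# zero out padded positions, then max-pool over the sequence dimension
values, _ = torch.max(output * attention_mask.unsqueeze(-1), dim=1)
# log(1 + relu(x)) keeps weights non-negative and compresses large activations
values = torch.log(1 + torch.relu(values))

# the step this commit adds: suppress special tokens entirely
special_token_ids = [0, 4]   # made-up ids standing in for e.g. [CLS] / [SEP]
values[:, special_token_ids] = 0

print(values.shape)          # torch.Size([2, 5]): one sparse vector per sample
```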
 
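The diff also changes `transform_sparse_vector_to_dict` so that `id_to_token` is read from a function attribute rather than passed as a parameter. The hunks only show the first half of the function body, so the sketch below is a hedged reconstruction: the slicing loop after `end_idxs` is an assumption based on the visible code, not the README's exact lines, and the five-token vocabulary is invented.

```python
import itertools
import torch

def transform_sparse_vector_to_dict(sparse_vector):
    sample_indices, token_indices = torch.nonzero(sparse_vector, as_tuple=True)
    non_zero_values = sparse_vector[(sample_indices, token_indices)].tolist()
    number_of_tokens_for_each_sample = torch.bincount(sample_indices).cpu().tolist()
    tokens = [transform_sparse_vector_to_dict.id_to_token[_id] for _id in token_indices.tolist()]

    # assumed completion: slice the flat token/weight lists back into one dict per sample
    output = []
    end_idxs = list(itertools.accumulate([0] + number_of_tokens_for_each_sample))
    for i in range(len(end_idxs) - 1):
        output.append(dict(zip(tokens[end_idxs[i]:end_idxs[i + 1]],
                               non_zero_values[end_idxs[i]:end_idxs[i + 1]])))
    return output

# made-up 5-token vocabulary, attached as a function attribute as the commit does
transform_sparse_vector_to_dict.id_to_token = ["[PAD]", "weather", "ny", "rain", "[SEP]"]

batch = torch.tensor([[0.0, 0.9, 0.4, 0.0, 0.0],
                      [0.0, 0.0, 0.0, 0.7, 0.0]])
print(transform_sparse_vector_to_dict(batch))
# roughly [{'weather': 0.9, 'ny': 0.4}, {'rain': 0.7}] (up to float32 rounding)
```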
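One subtlety in the updated setup code: the body of `get_sparse_vector` reads the module-level name `special_token_ids`, so the assignment `get_sparse_vector.special_token_ids = special_token_ids` attaches an attribute the function never actually consults; the global must be defined before the first call. A tiny, hypothetical example of the distinction (`f` and `offset` are made-up names):

```python
def f(x):
    return x + offset   # resolves "offset" as a global at call time

f.offset = 10           # attribute on the function object; the line above never reads it
offset = 1
print(f(0))             # 1: the global wins, f.offset is ignored
```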