Commit adc1a36 by ccdv
Parent: b3c2a0d

add mask_first_token

Files changed (3):
  1. README.md +5 -1
  2. config.json +1 -0
  3. modeling_lsg_distilbert.py +5 -0
README.md CHANGED
@@ -50,13 +50,17 @@ You can change various parameters like:
 Default parameters work well in practice. If you are short on memory, reduce the block sizes, increase the sparsity factor, and remove dropout in the attention score matrix.
 
 ```python
+from transformers import AutoModel
+
 model = AutoModel.from_pretrained("ccdv/lsg-distilbert-base-uncased-4096",
     trust_remote_code=True,
     num_global_tokens=16,
     block_size=64,
     sparse_block_size=64,
-    sparsity_factor=4,
     attention_probs_dropout_prob=0.0,
+    sparsity_factor=4,
+    sparsity_type="none",
+    mask_first_token=True
 )
 ```
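For reference, here is the updated snippet as a single runnable block. The parameter values are copied from the hunk above; the inline comments are editorial, and the note that `sparsity_type="none"` disables sparse attention follows the LSG documentation rather than anything in this commit.

```python
# Updated README usage, written out in full. `trust_remote_code=True` is
# required because the LSG model classes live in the Hub repository.
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "ccdv/lsg-distilbert-base-uncased-4096",
    trust_remote_code=True,
    num_global_tokens=16,
    block_size=64,
    sparse_block_size=64,
    attention_probs_dropout_prob=0.0,
    sparsity_factor=4,
    sparsity_type="none",   # "none" disables sparse attention (per the LSG docs)
    mask_first_token=True,  # the flag introduced by this commit
)
```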
 
config.json CHANGED
@@ -22,6 +22,7 @@
   "hidden_dim": 3072,
   "initializer_range": 0.02,
   "lsh_num_pre_rounds": 1,
+  "mask_first_token": false,
   "max_position_embeddings": 4096,
   "model_type": "distilbert",
   "n_heads": 12,
modeling_lsg_distilbert.py CHANGED
@@ -27,6 +27,7 @@ class LSGDistilBertConfig(DistilBertConfig):
         base_model_prefix="lsg",
         block_size=128,
         lsh_num_pre_rounds=1,
+        mask_first_token=False,
         num_global_tokens=1,
         pool_with_global=True,
         sparse_block_size=128,
@@ -42,6 +43,7 @@ class LSGDistilBertConfig(DistilBertConfig):
         self.base_model_prefix = base_model_prefix
         self.block_size = block_size
         self.lsh_num_pre_rounds = lsh_num_pre_rounds
+        self.mask_first_token = mask_first_token
         self.num_global_tokens = num_global_tokens
         self.pool_with_global = pool_with_global
         self.sparse_block_size = sparse_block_size
@@ -925,6 +927,7 @@ class LSGDistilBertModel(LSGDistilBertPreTrainedModel, DistilBertModel):
         assert hasattr(config, "block_size") and hasattr(config, "adaptive")
         self.block_size = config.block_size
         self.adaptive = config.adaptive
+        self.mask_first_token = config.mask_first_token
         self.pool_with_global = config.pool_with_global
 
         # Initialize weights and apply final processing
@@ -946,6 +949,8 @@ class LSGDistilBertModel(LSGDistilBertPreTrainedModel, DistilBertModel):
 
         if attention_mask is None:
             attention_mask = torch.ones(n, t, device=inputs_.device)
+        if self.mask_first_token:
+            attention_mask[:, 0] = 0
 
         b = self.block_size * 2
         pad = t % self.block_size
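The runtime change is small: when the flag is set, the first column of the attention mask is zeroed, so token 0 (typically `[CLS]`) is treated like padding by the attention layers. Below is a standalone sketch of the same operation in plain PyTorch; the variable names mirror the hunk above, but this is not the repository's code.

```python
import torch

batch_size, seq_len = 2, 8
attention_mask = torch.ones(batch_size, seq_len)  # 1 = attend, 0 = ignore

mask_first_token = True
if mask_first_token:
    # Zero the first column, as in LSGDistilBertModel above: every
    # sequence's first token is now invisible to attention.
    attention_mask[:, 0] = 0

print(attention_mask[0])  # tensor([0., 1., 1., 1., 1., 1., 1., 1.])
```

A plausible rationale, though the commit does not state one, is that LSG already prepends `num_global_tokens` learned global tokens, so the original first token can be redundant and masking it avoids attending to it twice in effect.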