Markus28 committed on
Commit
535ad9a
1 Parent(s): b563469

Set activation_checkpoint_lvl to 100 by default

Files changed (1):
  1. configuration_bert.py +4 -3
configuration_bert.py CHANGED
@@ -55,9 +55,10 @@ class JinaBertConfig(PretrainedConfig):
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
         window_size (`tuple`, *optional*, defaults to `(-1, -1)`): If not the default, use local attention
-        activation_checkpoint_lvl (`int`, *optional*, defaults to `0`): How many layers to activation-checkpoint.
+        activation_checkpoint_lvl (`int`, *optional*, defaults to `100`): How many layers to activation-checkpoint.
             If larger than 0, the MLP activation checkpointing level is expected to be 0 for the first
-            `activation_checkpoint_lvl` layers.
+            `activation_checkpoint_lvl` layers. The activation checkpointing will only come into effect
+            after `model.gradient_checkpointing_enable()` is called.
     """
 
     model_type = "bert"
@@ -89,7 +90,7 @@ class JinaBertConfig(PretrainedConfig):
         emb_pooler=None,
         classifier_dropout=None,
         num_loras=5,
-        activation_checkpoint_lvl=0,
+        activation_checkpoint_lvl=100,
         **kwargs,
     ):
         assert 'position_embedding_type' not in kwargs
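
For reference, a minimal usage sketch of the new default. The repo id below is illustrative and not taken from this commit; `AutoConfig`, `AutoModel`, and `gradient_checkpointing_enable()` are standard `transformers` APIs, the last being the call named in the docstring above.

from transformers import AutoConfig, AutoModel

# Load the remote-code config; activation_checkpoint_lvl now defaults to 100.
config = AutoConfig.from_pretrained(
    "jinaai/jina-bert-implementation",  # illustrative repo id, not from this commit
    trust_remote_code=True,
)

model = AutoModel.from_pretrained(
    "jinaai/jina-bert-implementation",  # illustrative repo id, not from this commit
    config=config,
    trust_remote_code=True,
)

# Per the docstring, the checkpointing level only takes effect once
# gradient checkpointing is enabled on the model.
model.gradient_checkpointing_enable()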