Faisal AlKhateeb committed on
Commit eb4f0e4
1 Parent(s): 19c3116

change mup param names

Files changed (3):
  1. config.json +4 -4
  2. configuration_btlm.py +14 -14
  3. modeling_btlm.py +10 -10
config.json CHANGED
@@ -15,7 +15,7 @@
   },
   "bos_token_id": 50256,
   "embd_pdrop": 0.0,
-  "embeddings_scale": 14.6,
+  "mup_embeddings_scale": 14.6,
   "eos_token_id": 50256,
   "initializer_range": 0.073,
   "layer_norm_epsilon": 1e-05,
@@ -25,16 +25,16 @@
   "n_inner": 6826,
   "n_layer": 32,
   "n_positions": 8192,
-  "output_logits_scale": 0.22200000000000003,
+  "mup_output_alpha": 2.2200000000000003,
   "position_embedding_type": "alibi",
   "reorder_and_upcast_attn": false,
   "resid_pdrop": 0.0,
   "scale_attn_by_inverse_layer_idx": false,
   "scale_attn_weights": true,
-  "scale_qk_dot_by_d": true,
+  "mup_scale_qk_dot_by_d": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.30.0",
   "use_cache": true,
   "vocab_size": 50257,
-  "width_scale": 0.1
+  "mup_width_scale": 0.1
 }
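The renamed keys carry the same numbers as before, except that the old pre-multiplied `output_logits_scale` (0.222) is now stored as the factor `mup_output_alpha` (2.22), which the model multiplies by `mup_width_scale` (0.1) at load time. Below is a minimal sketch of how an old-style config dict could be mapped onto the new key names; the `migrate_mup_keys` helper is hypothetical and not part of this commit.

# Hypothetical helper (not in this repo): map the old muP config keys onto the
# renamed ones introduced by this commit.
def migrate_mup_keys(old_cfg: dict) -> dict:
    cfg = dict(old_cfg)
    if "embeddings_scale" in cfg:
        cfg["mup_embeddings_scale"] = cfg.pop("embeddings_scale")
    if "scale_qk_dot_by_d" in cfg:
        cfg["mup_scale_qk_dot_by_d"] = cfg.pop("scale_qk_dot_by_d")
    if "width_scale" in cfg:
        cfg["mup_width_scale"] = cfg.pop("width_scale")
    if "output_logits_scale" in cfg:
        # The old key stored the pre-multiplied product; the new config stores
        # the alpha factor, so divide the width scale back out.
        cfg["mup_output_alpha"] = cfg.pop("output_logits_scale") / cfg.get("mup_width_scale", 1.0)
    return cfg

old = {
    "embeddings_scale": 14.6,
    "output_logits_scale": 0.22200000000000003,
    "scale_qk_dot_by_d": True,
    "width_scale": 0.1,
}
print(migrate_mup_keys(old))  # mup_output_alpha comes out ~2.22, matching the updated config.json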
configuration_btlm.py CHANGED
@@ -23,7 +23,7 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)
 
 BTLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "cerebras/BTLM-3B": "https://huggingface.co/cerebras/BTLM-3B/resolve/main/config.json",
+    "cerebras/btlm-3b-8k-base": "https://huggingface.co/cerebras/btlm-3b-8k-base/resolve/main/config.json",
 }
 
 
@@ -74,14 +74,14 @@ class BTLMConfig(PretrainedConfig):
             dot-product/softmax to float() when training with mixed precision.
         position_embedding_type (`str`, *optional*, defaults to `"learned"`):
             Positional embedding can be either `"alibi"` or `"learned"`.
-        width_scale (`float`, *optional*, defaults to 1.0):
+        mup_width_scale (`float`, *optional*, defaults to 1.0):
             muP parameter to scale learning rate and initializers. Calculated as (`d_model,0 / d_model`), where
             `d_model` is the model's width and `d_model,0` is the proxy model's width.
-        embeddings_scale (`float`, *optional*, defaults to 1.0):
+        mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
             muP parameter to scale token and position embeddings.
-        output_logits_scale (`float`, *optional*, defaults to 1.0):
-            muP parameter to scale output logits. Calculated as (`output_alpha * width_scale`)
-        scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
+        mup_output_alpha (`float`, *optional*, defaults to 1.0):
+            muP parameter to scale output logits (`output_logits_scale = mup_output_alpha * mup_width_scale`).
+        mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
             Scale attention weights by dividing by hidden_size instead of sqrt(hidden_size). Need to set
             scale_attn_weights to `True` as well.
 
@@ -130,10 +130,10 @@
         scale_attn_by_inverse_layer_idx=False,
         reorder_and_upcast_attn=False,
         position_embedding_type="learned",
-        width_scale=1.0,
-        embeddings_scale=1.0,
-        output_logits_scale=1.0,
-        scale_qk_dot_by_d=False,
+        mup_width_scale=1.0,
+        mup_embeddings_scale=1.0,
+        mup_output_alpha=1.0,
+        mup_scale_qk_dot_by_d=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -157,9 +157,9 @@
         self.eos_token_id = eos_token_id
 
         self.position_embedding_type = position_embedding_type
-        self.width_scale = width_scale
-        self.embeddings_scale = embeddings_scale
-        self.output_logits_scale = output_logits_scale
-        self.scale_qk_dot_by_d = scale_qk_dot_by_d
+        self.mup_width_scale = mup_width_scale
+        self.mup_embeddings_scale = mup_embeddings_scale
+        self.mup_output_alpha = mup_output_alpha
+        self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d
 
         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
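A minimal sketch of constructing the config with the renamed muP arguments, using the values from the updated config.json above. The GPT-2-style size arguments (`n_positions`, `n_layer`, `n_inner`) are assumed to be accepted by the constructor since they appear in config.json, and importing `configuration_btlm` locally is assumed.

from configuration_btlm import BTLMConfig

config = BTLMConfig(
    n_positions=8192,
    n_layer=32,
    n_inner=6826,
    position_embedding_type="alibi",
    mup_width_scale=0.1,                    # d_model,0 / d_model
    mup_embeddings_scale=14.6,              # multiplies token/position embeddings
    mup_output_alpha=2.2200000000000003,    # logits scale = alpha * width_scale
    mup_scale_qk_dot_by_d=True,             # divide QK^T by d instead of sqrt(d)
)

# The product reproduces the old pre-multiplied `output_logits_scale` of 0.222.
assert abs(config.mup_output_alpha * config.mup_width_scale - 0.222) < 1e-6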
modeling_btlm.py CHANGED
@@ -48,11 +48,11 @@ from .configuration_btlm import BTLMConfig
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "cerebras/BTLM-3B"
+_CHECKPOINT_FOR_DOC = "cerebras/btlm-3b-8k-base"
 _CONFIG_FOR_DOC = "BTLMConfig"
 
 BTLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "cerebras/BTLM-3B",
+    "cerebras/btlm-3b-8k-base",
     # See all BTLM models at https://huggingface.co/models?filter=btlm
 ]
 
@@ -204,7 +204,7 @@ class BTLMAttention(nn.Module):
 
         self.pruned_heads = set()
 
-        self.attn_scale_power = 1.0 if config.scale_qk_dot_by_d else 0.5
+        self.attn_scale_power = 1.0 if config.mup_scale_qk_dot_by_d else 0.5
 
     def prune_heads(self, heads):
         if len(heads) == 0:
@@ -511,7 +511,7 @@ class BTLMPreTrainedModel(PreTrainedModel):
 
     def _init_weights(self, module):
         """Initialize the weights."""
-        mup_init_scale = math.sqrt(self.config.width_scale)
+        mup_init_scale = math.sqrt(self.config.mup_width_scale)
         if isinstance(module, (nn.Linear, Conv1D)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
@@ -576,7 +576,7 @@ class BTLMPreTrainedModel(PreTrainedModel):
                 return 1
             return 0
 
-        width_scale = self.config.width_scale
+        width_scale = self.config.mup_width_scale
         new_param_groups = []
         new_param_groups.append({"params": [], "lr": lr * width_scale, "weight_decay": weight_decay})
         if not decoupled_wd:
@@ -754,7 +754,7 @@ class BTLMModel(BTLMPreTrainedModel):
             if config.position_embedding_type != "alibi"
             else None
         )
-        self.embeddings_scale = config.embeddings_scale
+        self.embeddings_scale = config.mup_embeddings_scale
 
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([BTLMBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
@@ -1062,7 +1062,7 @@ class BTLMLMHeadModel(BTLMPreTrainedModel):
         super().__init__(config)
         self.transformer = BTLMModel(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
-        self.output_logits_scale = config.output_logits_scale
+        self.output_logits_scale = config.mup_output_alpha * config.mup_width_scale
 
         # Model parallel
         self.model_parallel = False
@@ -1264,7 +1264,7 @@ class BTLMForSequenceClassification(BTLMPreTrainedModel):
         self.num_labels = config.num_labels
         self.transformer = BTLMModel(config)
         self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
-        self.output_logits_scale = config.output_logits_scale
+        self.output_logits_scale = config.mup_output_alpha * config.mup_width_scale
 
         # Model parallel
         self.model_parallel = False
@@ -1397,7 +1397,7 @@ class BTLMForTokenClassification(BTLMPreTrainedModel):
             classifier_dropout = 0.1
         self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-        self.output_logits_scale = config.output_logits_scale
+        self.output_logits_scale = config.mup_output_alpha * config.mup_width_scale
 
         # Model parallel
         self.model_parallel = False
@@ -1492,7 +1492,7 @@ class BTLMForQuestionAnswering(BTLMPreTrainedModel):
         self.num_labels = config.num_labels
         self.transformer = BTLMModel(config)
         self.qa_outputs = nn.Linear(config.hidden_size, 2)
-        self.output_logits_scale = config.output_logits_scale
+        self.output_logits_scale = config.mup_output_alpha * config.mup_width_scale
 
         # Model parallel
         self.model_parallel = False
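Across the modeling file the rename reduces to three scalars: `attn_scale_power` chosen from `mup_scale_qk_dot_by_d`, an `embeddings_scale` read from `mup_embeddings_scale`, and an `output_logits_scale` now computed as `mup_output_alpha * mup_width_scale`. The following is an illustrative sketch of where those scalars act, paraphrasing the muP scheme rather than copying the repo's modeling code; the width, vocabulary size, and head dimension below are assumed for illustration and are not part of this diff.

import torch

mup_embeddings_scale = 14.6
mup_output_alpha, mup_width_scale = 2.2200000000000003, 0.1
attn_scale_power = 1.0        # because mup_scale_qk_dot_by_d is true; 0.5 would recover 1/sqrt(d)

d_model, vocab_size, head_dim = 2560, 50257, 80   # assumed sizes for illustration only

# 1) Embedding output is multiplied by the muP embeddings scale.
hidden = torch.randn(1, 8, d_model) * mup_embeddings_scale

# 2) Attention scores are divided by head_dim ** attn_scale_power (d rather than sqrt(d)).
q, k = torch.randn(1, 8, head_dim), torch.randn(1, 8, head_dim)
attn_scores = (q @ k.transpose(-1, -2)) / (head_dim ** attn_scale_power)

# 3) Logits are multiplied by output_logits_scale = mup_output_alpha * mup_width_scale.
lm_head = torch.nn.Linear(d_model, vocab_size, bias=False)
logits = lm_head(hidden) * (mup_output_alpha * mup_width_scale)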