ver217 committed
Commit: f7c6e7f
1 Parent(s): 527550f

[hotfix] update ffn dim

Files changed (2):
  1. configuration_grok1.py +2 -2
  2. modeling_grok1.py +3 -4
configuration_grok1.py CHANGED
@@ -9,7 +9,7 @@ class Grok1Config(PretrainedConfig):
         self,
         vocab_size=32000,
         hidden_size=4096,
-        widening_factor=4.0,
+        intermediate_size=32768,
         num_hidden_layers=32,
         num_attention_heads=32,
         num_key_value_heads=32,
@@ -37,7 +37,7 @@ class Grok1Config(PretrainedConfig):
         self.embedding_multiplier_scale = embedding_multiplier_scale
         self.output_multiplier_scale = output_multiplier_scale
         self.hidden_size = hidden_size
-        self.widening_factor = widening_factor
+        self.intermediate_size = intermediate_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
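For context: the old configuration derived the FFN width from a widening factor, int(hidden_size * widening_factor) = int(4096 * 4.0) = 16384, while the new explicit default is intermediate_size=32768, which appears to be the mismatch this hotfix addresses. A minimal sketch of that arithmetic, using only values visible in this diff:

hidden_size = 4096

# Before this commit: FFN width derived from a widening factor.
widening_factor = 4.0
old_ffn_dim = int(hidden_size * widening_factor)  # 16384

# After this commit: FFN width set explicitly in the config.
intermediate_size = 32768

print(old_ffn_dim, intermediate_size)  # 16384 32768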
 
modeling_grok1.py CHANGED
@@ -395,11 +395,11 @@ class DecoderLayer(nn.Module):
     def __init__(
         self,
         hidden_size: int,
+        intermediate_size: int,
         num_heads: int,
         num_key_value_heads: int,
         num_experts: int,
         top_k: int,
-        widening_factor: float = 4.0,
         max_position_embeddings: int = 2048,
         attn_output_multiplier: float = 1.0,
         max_attn_val: float = 30.0,
@@ -414,8 +414,7 @@ class DecoderLayer(nn.Module):
             attn_output_multiplier=attn_output_multiplier,
             max_attn_val=max_attn_val,
         )
-        ffn_dim = int(hidden_size * widening_factor)
-        self.moe_block = MoeBlock(hidden_size, ffn_dim, num_experts, top_k)
+        self.moe_block = MoeBlock(hidden_size, intermediate_size, num_experts, top_k)
         self.pre_attn_norm = RMSNorm(hidden_size, eps=rms_norm_eps)
         self.post_attn_norm = RMSNorm(hidden_size, eps=rms_norm_eps)
         self.pre_moe_norm = RMSNorm(hidden_size, eps=rms_norm_eps)
@@ -543,11 +542,11 @@ class Grok1Model(Grok1PretrainedModel):
             [
                 DecoderLayer(
                     hidden_size=config.hidden_size,
+                    intermediate_size=config.intermediate_size,
                     num_heads=config.num_attention_heads,
                     num_key_value_heads=config.num_key_value_heads,
                     num_experts=config.num_experts,
                     top_k=config.num_experts_per_tok,
-                    widening_factor=config.widening_factor,
                     max_position_embeddings=config.max_position_embeddings,
                     attn_output_multiplier=config.attn_output_multiplier,
                     max_attn_val=config.max_attn_value,
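A minimal sketch of the new wiring, using stand-in stubs for Grok1Config, DecoderLayer and MoeBlock (the real classes live in the two files above; only the intermediate_size plumbing follows the diff, everything else here is illustrative):

# Illustrative stubs only; not the repository's implementations.
class Grok1Config:
    def __init__(self, hidden_size=4096, intermediate_size=32768,
                 num_experts=8, num_experts_per_tok=2):
        # Expert counts are placeholder values, not taken from this diff.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_experts = num_experts
        self.num_experts_per_tok = num_experts_per_tok

class MoeBlock:
    def __init__(self, hidden_size, ffn_dim, num_experts, top_k):
        self.ffn_dim = ffn_dim

class DecoderLayer:
    def __init__(self, hidden_size, intermediate_size, num_experts, top_k):
        # After this commit the FFN width comes straight from the config,
        # instead of being recomputed as int(hidden_size * widening_factor).
        self.moe_block = MoeBlock(hidden_size, intermediate_size, num_experts, top_k)

config = Grok1Config()
layer = DecoderLayer(config.hidden_size, config.intermediate_size,
                     config.num_experts, config.num_experts_per_tok)
print(layer.moe_block.ffn_dim)  # 32768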