Muennighoff committed on
Commit
7251f46
·
1 Parent(s): def21c3
Files changed (3) hide show
  1. config.json +1 -1
  2. config_molmoe.py +8 -5
  3. modeling_molmoe.py +27 -20
config.json CHANGED
@@ -7,7 +7,7 @@
7
  "AutoModelForCausalLM": "modeling_molmoe.MOLMoEForCausalLM"
8
  },
9
  "clip_qkv": null,
10
- "embedding_size": 152064,
11
  "hidden_size": 2048,
12
  "initializer_range": 0.02,
13
  "intermediate_size": 1024,
 
7
  "AutoModelForCausalLM": "modeling_molmoe.MOLMoEForCausalLM"
8
  },
9
  "clip_qkv": null,
10
+ "embedding_size": 50304,
11
  "hidden_size": 2048,
12
  "initializer_range": 0.02,
13
  "intermediate_size": 1024,
config_molmoe.py CHANGED
@@ -5,19 +5,22 @@ from transformers import PretrainedConfig, AutoTokenizer
5
 
6
  def config_to_moe_args(config):
7
  from megablocks.layers.arguments import Arguments as MoEArgs
 
 
 
8
 
9
  kwargs = {
10
  "activation_fn": F.silu,
11
  "mlp_type": "glu" if "glu" in config.activation_type.lower() else "mlp",
12
  "mlp_impl": "sparse",
13
- "hidden_size": config.hidden_size,
14
- "ffn_hidden_size": config.intermediate_size,
15
- "moe_num_experts": config.moe_num_experts,
16
- "num_layers": config.num_hidden_layers,
17
  # Handled by FSDP (https://github.com/databricks/megablocks/issues/57#issuecomment-1854594483)
18
  "moe_weight_parallelism": False,
19
  "moe_expert_model_parallelism": False,
20
- "moe_top_k": config.moe_top_k,
21
  # "moe_loss_weight": config.moe_loss_weight,
22
  # "device": config.init_device,
23
  # Handled by FSDP
 
5
 
6
  def config_to_moe_args(config):
7
  from megablocks.layers.arguments import Arguments as MoEArgs
8
+ import torch.nn.functional as F
9
+
10
+ # import pdb; pdb.set_trace()
11
 
12
  kwargs = {
13
  "activation_fn": F.silu,
14
  "mlp_type": "glu" if "glu" in config.activation_type.lower() else "mlp",
15
  "mlp_impl": "sparse",
16
+ "hidden_size": config.d_model,
17
+ "ffn_hidden_size": config.mlp_hidden_size,
18
+ "moe_num_experts": 64,
19
+ "num_layers": config.n_layers,
20
  # Handled by FSDP (https://github.com/databricks/megablocks/issues/57#issuecomment-1854594483)
21
  "moe_weight_parallelism": False,
22
  "moe_expert_model_parallelism": False,
23
+ "moe_top_k": 8,
24
  # "moe_loss_weight": config.moe_loss_weight,
25
  # "device": config.init_device,
26
  # Handled by FSDP
modeling_molmoe.py CHANGED
@@ -235,14 +235,16 @@ class OLMoBlock(nn.Module):
235
  device=config.init_device
236
  )
237
 
238
- # Feed-forward output projection.
239
- self.ff_out = nn.Linear(
240
- int(self.act.output_multiplier * self.hidden_size),
241
- config.d_model,
242
- bias=config.include_bias,
243
- device=config.init_device,
244
- )
245
- self.ff_out._is_residual = True # type: ignore
 
 
246
 
247
  # Rotary embeddings.
248
  if self.config.rope:
@@ -423,7 +425,7 @@ class OLMoBlock(nn.Module):
423
  return OLMoSequentialBlock(layer_id, config, cache)
424
  elif config.block_type == "llama":
425
  return OLMoLlamaBlock(layer_id, config, cache)
426
- elif config.block_type == BlockType.moe:
427
  return OLMoEBlock(layer_id, config, cache)
428
  else:
429
  raise NotImplementedError(f"Unknown block type: '{config.block_type}'")
@@ -725,7 +727,7 @@ class OLMoEBlock(OLMoBlock):
725
  (plus another skip connection).
726
  """
727
 
728
- def __init__(self, layer_id: int, config: ModelConfig, cache: BufferCache):
729
  try:
730
  from megablocks.layers.dmoe import dMoE
731
  from megablocks.layers.moe import MoE
@@ -733,12 +735,12 @@ class OLMoEBlock(OLMoBlock):
733
  raise ImportError(
734
  "To train MoEs, run `pip install git+https://github.com/Muennighoff/megablocks.git@olmoe`"
735
  )
736
- from .config import config_to_moe_args
737
 
738
  super().__init__(layer_id, config, cache)
739
 
740
  self.moe_args = config_to_moe_args(config)
741
- self.ffn = dMoE(self.moe_args) if self.config.moe_dropless else MoE(self.moe_args)
742
 
743
  self.attn_norm = LayerNorm.build(config)
744
  self.ff_norm = LayerNorm.build(config)
@@ -956,12 +958,14 @@ class VisionBackboneConfig:
956
  image_default_input_size: Tuple[int, int] = (336, 336)
957
  image_patch_size: int = 14
958
  image_pos_patch_size: int = 14
 
959
  image_emb_dim: int = 1024
960
  image_num_heads: int = 16
961
  image_num_key_value_heads: int = 16
962
  image_num_layers: int = 24
963
  image_head_dim: int = 64
964
- image_mlp_dim: int = 4096
 
965
  image_mlp_activations: str = "gelu"
966
  image_dropout_rate: float = 0.0
967
  image_num_pos: int = 577
@@ -990,10 +994,10 @@ class FullMolmoeConfig:
990
  qkv_bias: bool = False
991
  clip_qkv: Optional[float] = None
992
  n_layers: int = 12
993
- mlp_ratio: int = 4
994
  mlp_hidden_size: Optional[int] = None
995
  activation_type: str = "swiglu"
996
- block_type: str = "sequential"
997
  block_group_size: int = 1
998
  alibi: bool = False
999
  alibi_bias_max: float = 8.0
@@ -1009,7 +1013,7 @@ class FullMolmoeConfig:
1009
  attention_dropout: float = 0.1
1010
  response_attention_dropout: float = 0.0
1011
  multi_query_attention: Optional[bool] = None
1012
- attention_layer_norm: bool = False
1013
  residual_dropout: float = 0.1
1014
  response_residual_dropout: float = 0.0
1015
  embedding_dropout: float = 0.1
@@ -1651,6 +1655,9 @@ class OLMoVisionBackbone(nn.Module):
1651
  [MLP(mlp_config, input_dim), Residual(MLP(config, input_dim))]
1652
  )
1653
  elif config.image_projector == ImageProjectType.mlp:
 
 
 
1654
  self.image_projector = MLP(mlp_config, input_dim)
1655
  elif config.image_projector == ImageProjectType.linear:
1656
  self.image_projector = nn.Linear(
@@ -2423,7 +2430,7 @@ class MOLMoEForCausalLM(PreTrainedModel):
2423
  base_model_prefix = "model"
2424
  _no_split_modules = ["OLMoBlock"]
2425
 
2426
- def __init__(self, config: MolmoeConfig, model: Optional[MOLMo] = None, init_params: bool = False):
2427
  super().__init__(config)
2428
 
2429
  if not model:
@@ -2447,8 +2454,8 @@ class MOLMoEForCausalLM(PreTrainedModel):
2447
  additional_vocab_size=128,
2448
  n_heads=config.num_attention_heads,
2449
  n_kv_heads=config.num_key_value_heads,
2450
- rope_theta=1000000.0,
2451
- layer_norm_eps=1e-6,
2452
  layer_norm_type="rms",
2453
  pad_tokenizer=True,
2454
  vit_layers=[-2, -9],
@@ -2472,7 +2479,7 @@ class MOLMoEForCausalLM(PreTrainedModel):
2472
  initializer_range=0.02,
2473
  )
2474
  )
2475
- self.model = MOLMo(full_config, init_params=init_params)
2476
  else:
2477
  self.model = model
2478
 
 
235
  device=config.init_device
236
  )
237
 
238
+
239
+ if self.config.block_type != "moe":
240
+ # Feed-forward output projection.
241
+ self.ff_out = nn.Linear(
242
+ int(self.act.output_multiplier * self.hidden_size),
243
+ config.d_model,
244
+ bias=config.include_bias,
245
+ device=config.init_device,
246
+ )
247
+ self.ff_out._is_residual = True # type: ignore
248
 
249
  # Rotary embeddings.
250
  if self.config.rope:
 
425
  return OLMoSequentialBlock(layer_id, config, cache)
426
  elif config.block_type == "llama":
427
  return OLMoLlamaBlock(layer_id, config, cache)
428
+ elif config.block_type == "moe":
429
  return OLMoEBlock(layer_id, config, cache)
430
  else:
431
  raise NotImplementedError(f"Unknown block type: '{config.block_type}'")
 
727
  (plus another skip connection).
728
  """
729
 
730
+ def __init__(self, layer_id: int, config, cache: BufferCache):
731
  try:
732
  from megablocks.layers.dmoe import dMoE
733
  from megablocks.layers.moe import MoE
 
735
  raise ImportError(
736
  "To train MoEs, run `pip install git+https://github.com/Muennighoff/megablocks.git@olmoe`"
737
  )
738
+ from .config_molmoe import config_to_moe_args
739
 
740
  super().__init__(layer_id, config, cache)
741
 
742
  self.moe_args = config_to_moe_args(config)
743
+ self.ffn = dMoE(self.moe_args)
744
 
745
  self.attn_norm = LayerNorm.build(config)
746
  self.ff_norm = LayerNorm.build(config)
 
958
  image_default_input_size: Tuple[int, int] = (336, 336)
959
  image_patch_size: int = 14
960
  image_pos_patch_size: int = 14
961
+ # image_emb_dim: int = 1024
962
  image_emb_dim: int = 1024
963
  image_num_heads: int = 16
964
  image_num_key_value_heads: int = 16
965
  image_num_layers: int = 24
966
  image_head_dim: int = 64
967
+ # image_mlp_dim: int = 4096
968
+ image_mlp_dim: int = 2048
969
  image_mlp_activations: str = "gelu"
970
  image_dropout_rate: float = 0.0
971
  image_num_pos: int = 577
 
994
  qkv_bias: bool = False
995
  clip_qkv: Optional[float] = None
996
  n_layers: int = 12
997
+ mlp_ratio: int = 1
998
  mlp_hidden_size: Optional[int] = None
999
  activation_type: str = "swiglu"
1000
+ block_type: str = "moe"
1001
  block_group_size: int = 1
1002
  alibi: bool = False
1003
  alibi_bias_max: float = 8.0
 
1013
  attention_dropout: float = 0.1
1014
  response_attention_dropout: float = 0.0
1015
  multi_query_attention: Optional[bool] = None
1016
+ attention_layer_norm: bool = True
1017
  residual_dropout: float = 0.1
1018
  response_residual_dropout: float = 0.0
1019
  embedding_dropout: float = 0.1
 
1655
  [MLP(mlp_config, input_dim), Residual(MLP(config, input_dim))]
1656
  )
1657
  elif config.image_projector == ImageProjectType.mlp:
1658
+ #import pdb; pdb.set_trace()
1659
+ #mlp_config.image_mlp_dim = 2048
1660
+ mlp_config.mlp_hidden_size = 2048
1661
  self.image_projector = MLP(mlp_config, input_dim)
1662
  elif config.image_projector == ImageProjectType.linear:
1663
  self.image_projector = nn.Linear(
 
2430
  base_model_prefix = "model"
2431
  _no_split_modules = ["OLMoBlock"]
2432
 
2433
+ def __init__(self, config: MolmoeConfig, model: Optional[MOLMoE] = None, init_params: bool = False):
2434
  super().__init__(config)
2435
 
2436
  if not model:
 
2454
  additional_vocab_size=128,
2455
  n_heads=config.num_attention_heads,
2456
  n_kv_heads=config.num_key_value_heads,
2457
+ rope_theta=10000.0,
2458
+ layer_norm_eps=1e-5,
2459
  layer_norm_type="rms",
2460
  pad_tokenizer=True,
2461
  vit_layers=[-2, -9],
 
2479
  initializer_range=0.02,
2480
  )
2481
  )
2482
+ self.model = MOLMoE(full_config, init_params=init_params)
2483
  else:
2484
  self.model = model
2485