Crystalcareai committed
Commit 2e51e15
Parent: 292a484

Update configuration_gemmoe.py

Files changed (1): configuration_gemmoe.py (+0 -44)
configuration_gemmoe.py CHANGED
@@ -92,30 +92,6 @@ class GemmoeConfig(PretrainedConfig):
         output_router_logits (`bool`, *optional*, defaults to `False`):
             Whether or not to output the logits of the routers. They are useful for computing the router loss, and
             should not be returned during inference.
-        n_shared_experts (`int`, *optional*, defaults to `None`):
-            The number of shared experts used in the sparse mixture of experts layer. If set to `None`, no shared
-            experts are used.
-        n_routed_experts (`int`, *optional*, defaults to `None`):
-            The number of routed experts used in the sparse mixture of experts layer. If set to `None`, all experts are
-            routed experts.
-        moe_layer_freq (`int`, *optional*, defaults to 1):
-            The frequency of MoE layers in the model. A value of 1 means MoE layers are used in every layer, a value of
-            2 means MoE layers are used in every other layer, and so on.
-        first_k_dense_replace (`int`, *optional*, defaults to 0):
-            The number of initial dense layers to replace with MoE layers. If set to 0 (default), no dense layers are
-            replaced.
-        norm_topk_prob (`bool`, *optional*, defaults to `False`):
-            Whether to normalize the top-k probabilities of the router during training.
-        scoring_func (`str`, *optional*, defaults to `'softmax'`):
-            The scoring function used by the router. Can be 'softmax' or 'remap'.
-        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
-            The weight of the auxiliary loss used for training the router.
-        seq_aux (`bool`, *optional*, defaults to `True`):
-            Whether to use sequence-level auxiliary loss for training the router.
-        pretraining_tp (`int`, *optional*, defaults to 1):
-            The tensor parallelism used for pretraining.
-        rope_scaling (`float`, *optional*, defaults to `None`):
-            The scaling factor for the Rotary Position Embedding (RoPE). If set to `None`, no scaling is applied.
 
     ```python
     >>> from transformers import GemmoeModel, GemmoeConfig
@@ -156,16 +132,6 @@ class GemmoeConfig(PretrainedConfig):
         attention_dropout=0.0,
         num_experts_per_tok=2,
         num_local_experts=8,
-        n_shared_experts=8,
-        n_routed_experts=2,
-        moe_layer_freq=1,
-        first_k_dense_replace=0,
-        norm_topk_prob=False,
-        scoring_func='softmax',
-        aux_loss_alpha=0.001,
-        seq_aux=True,
-        pretraining_tp=1,
-        rope_scaling=None,
         router_aux_loss_coef=0.02,
         output_router_logits=False,
         **kwargs,
@@ -187,16 +153,6 @@ class GemmoeConfig(PretrainedConfig):
         self.attention_dropout = attention_dropout
         self.num_experts_per_tok = num_experts_per_tok
         self.num_local_experts = num_local_experts
-        self.n_shared_experts = n_shared_experts
-        self.n_routed_experts = n_routed_experts
-        self.moe_layer_freq = moe_layer_freq
-        self.first_k_dense_replace = first_k_dense_replace
-        self.norm_topk_prob = norm_topk_prob
-        self.scoring_func = scoring_func
-        self.aux_loss_alpha = aux_loss_alpha
-        self.seq_aux = seq_aux
-        self.pretraining_tp = pretraining_tp
-        self.rope_scaling = rope_scaling
         self.router_aux_loss_coef = router_aux_loss_coef
         self.output_router_logits = output_router_logits
 
 
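With the removal applied, the configuration exposes only the router knobs kept in the trimmed `__init__`. The snippet below mirrors the class docstring's own `GemmoeConfig`/`GemmoeModel` example; the argument names and values come straight from the remaining signature, and it assumes, as the docstring does, that both classes are importable from `transformers`.

```python
from transformers import GemmoeModel, GemmoeConfig  # import style taken from the class docstring

# Surviving MoE-related arguments, shown with their defaults from the trimmed __init__.
configuration = GemmoeConfig(
    num_experts_per_tok=2,       # experts routed per token
    num_local_experts=8,         # total experts in each MoE layer
    router_aux_loss_coef=0.02,   # weight of the router auxiliary loss
    output_router_logits=False,  # keep router logits out of inference outputs
)

model = GemmoeModel(configuration)
```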