Update modeling_mixtral_clex.py
modeling_mixtral_clex.py CHANGED (+9 -9)
@@ -51,7 +51,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.utils.import_utils import is_torch_fx_available
-from .configuration_mixtral_clex import MixtralConfig
+from .configuration_mixtral_clex import CLEXMixtralConfig
 from .clex_layer import CLEXScalingRotaryEmbedding
 
 if is_flash_attn_2_available():
@@ -71,7 +71,7 @@ if is_torch_fx_available():
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "MixtralConfig"
+_CONFIG_FOR_DOC = "CLEXMixtralConfig"
 
 
 def load_balancing_loss_func(gate_logits: torch.Tensor, num_experts: torch.Tensor = None, top_k=2) -> float:
@@ -254,7 +254,7 @@ class MixtralAttention(nn.Module):
     and "Generating Long Sequences with Sparse Transformers".
     """
 
-    def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None):
+    def __init__(self, config: CLEXMixtralConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -847,7 +847,7 @@ MIXTRAL_ATTENTION_CLASSES = {
 
 
 class MixtralBLockSparseTop2MLP(nn.Module):
-    def __init__(self, config: MixtralConfig):
+    def __init__(self, config: CLEXMixtralConfig):
         super().__init__()
         self.ffn_dim = config.intermediate_size
         self.hidden_dim = config.hidden_size
@@ -935,7 +935,7 @@ class MixtralSparseMoeBlock(nn.Module):
 
 
 class MixtralDecoderLayer(nn.Module):
-    def __init__(self, config: MixtralConfig, layer_idx: int):
+    def __init__(self, config: CLEXMixtralConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
 
@@ -1024,7 +1024,7 @@ MIXTRAL_START_DOCSTRING = r"""
         and behavior.
 
     Parameters:
-        config ([`MixtralConfig`]):
+        config ([`CLEXMixtralConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1037,7 +1037,7 @@ MIXTRAL_START_DOCSTRING = r"""
 )
 # Copied from transformers.models.mistral.modeling_mistral.MistralPreTrainedModel with Mistral->Mixtral
 class MixtralPreTrainedModel(PreTrainedModel):
-    config_class = MixtralConfig
+    config_class = CLEXMixtralConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["MixtralDecoderLayer", "CLEXScalingRotaryEmbedding"]
@@ -1135,10 +1135,10 @@ class MixtralModel(MixtralPreTrainedModel):
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MixtralDecoderLayer`]
 
     Args:
-        config: MixtralConfig
+        config: CLEXMixtralConfig
     """
 
-    def __init__(self, config: MixtralConfig):
+    def __init__(self, config: CLEXMixtralConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size