Guanzheng committed
Commit 74e9804
1 Parent(s): d1b9f30

Update modeling_mixtral_clex.py

Files changed (1)
modeling_mixtral_clex.py +10 -9
modeling_mixtral_clex.py CHANGED
@@ -51,7 +51,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.utils.import_utils import is_torch_fx_available
-from .configuration_mixtral_clex import MixtralConfig
+from .configuration_mixtral_clex import CLEXMixtralConfig
 from .clex_layer import CLEXScalingRotaryEmbedding
 
 if is_flash_attn_2_available():
@@ -71,7 +71,7 @@ if is_torch_fx_available():
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "MixtralConfig"
+_CONFIG_FOR_DOC = "CLEXMixtralConfig"
 
 
 def load_balancing_loss_func(gate_logits: torch.Tensor, num_experts: torch.Tensor = None, top_k=2) -> float:
@@ -254,7 +254,7 @@ class MixtralAttention(nn.Module):
     and "Generating Long Sequences with Sparse Transformers".
     """
 
-    def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None):
+    def __init__(self, config: CLEXMixtralConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -847,7 +847,7 @@ MIXTRAL_ATTENTION_CLASSES = {
 
 
 class MixtralBLockSparseTop2MLP(nn.Module):
-    def __init__(self, config: MixtralConfig):
+    def __init__(self, config: CLEXMixtralConfig):
         super().__init__()
         self.ffn_dim = config.intermediate_size
         self.hidden_dim = config.hidden_size
@@ -935,7 +935,7 @@ class MixtralSparseMoeBlock(nn.Module):
 
 
 class MixtralDecoderLayer(nn.Module):
-    def __init__(self, config: MixtralConfig, layer_idx: int):
+    def __init__(self, config: CLEXMixtralConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
 
@@ -1024,7 +1024,7 @@ MIXTRAL_START_DOCSTRING = r"""
         and behavior.
 
     Parameters:
-        config ([`MixtralConfig`]):
+        config ([`CLEXMixtralConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1037,7 +1037,7 @@ MIXTRAL_START_DOCSTRING = r"""
 )
 # Copied from transformers.models.mistral.modeling_mistral.MistralPreTrainedModel with Mistral->Mixtral
 class MixtralPreTrainedModel(PreTrainedModel):
-    config_class = MixtralConfig
+    config_class = CLEXMixtralConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["MixtralDecoderLayer", "CLEXScalingRotaryEmbedding"]
@@ -1135,10 +1135,10 @@ class MixtralModel(MixtralPreTrainedModel):
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MixtralDecoderLayer`]
 
     Args:
-        config: MixtralConfig
+        config: CLEXMixtralConfig
     """
 
-    def __init__(self, config: MixtralConfig):
+    def __init__(self, config: CLEXMixtralConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
@@ -1410,6 +1410,7 @@ class MixtralForCausalLM(MixtralPreTrainedModel):
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
         ```"""
+        print(input_ids[0,20:30].tolist())
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
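
The diff repoints the modeling file at the repo's own CLEXMixtralConfig rather than the stock MixtralConfig. As a minimal sketch (not part of this commit), a checkpoint that ships this custom code would typically be loaded through transformers as below; the repo id is a hypothetical placeholder.

# Minimal sketch of loading a checkpoint that ships this custom modeling code.
# Assumptions: the repo id is a placeholder, and the repository exposes
# configuration_mixtral_clex.py / modeling_mixtral_clex.py as remote code.
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo_id = "<clex-mixtral-repo>"  # hypothetical placeholder, not from this commit

# trust_remote_code=True lets transformers import the repo's custom modules,
# so CLEXMixtralConfig (rather than the stock MixtralConfig) backs the model.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, config=config, trust_remote_code=True)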