Update modeling_Llamoe.py

modeling_Llamoe.py  CHANGED  (+9 -31)
@@ -747,7 +747,7 @@ class LlamoeSdpaAttention(LlamoeAttention):
         return attn_output, None, past_key_value
 
 
-
+LLAMOE_ATTENTION_CLASSES = {
     "eager": LlamoeAttention,
     "flash_attention_2": LlamoeFlashAttention2,
     "sdpa": LlamoeSdpaAttention,
@@ -833,7 +833,7 @@ class LlamoeDecoderLayer(nn.Module):
 
 
 
-
+LLAMOE_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -851,12 +851,8 @@ LLAMA_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare
-
-)
-@add_start_docstrings(
-    "The bare Gemmoe Model outputting raw hidden-states without any specific head on top.",
-    GEMMOE_START_DOCSTRING,
+    "The bare Llamoe Model outputting raw hidden-states without any specific head on top.",
+    LLAMOE_START_DOCSTRING,
 )
 
 class LlammoePreTrainedModel(PreTrainedModel):
@@ -903,7 +899,7 @@ class LlammoePreTrainedModel(PreTrainedModel):
             layer.self_attn.past_key_value = None
 
 
-
+LLAMOE_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -967,14 +963,14 @@ GEMMOE_INPUTS_DOCSTRING = r"""
 
 @add_start_docstrings(
     "The bare Gemmoe Model outputting raw hidden-states without any specific head on top.",
-
+    LLAMOE_START_DOCSTRING,
 )
 
 class LlamoeModel(LlammoePreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamoeDecoderLayer`]
     Args:
-        config:
+        config: LlamoeConfig
     """
 
     def __init__(self, config: LlamoeConfig):
@@ -1229,25 +1225,7 @@ class LlamoeForCausalLM(LlammoePreTrainedModel):
         output_router_logits: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
-
-        Args:
-            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-        Returns:
-        Example:
-        ```python
-        >>> from transformers import AutoTokenizer, GemmoeForCausalLM
-        >>> model = GemmoeForCausalLM.from_pretrained("mistralai/Gemmoe-8x7B-v0.1")
-        >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Gemmoe-8x7B-v0.1")
-        >>> prompt = "Hey, are you conscious? Can you talk to me?"
-        >>> inputs = tokenizer(prompt, return_tensors="pt")
-        >>> # Generate
-        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
-        ```"""
+
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_router_logits = (
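A note on the first hunk: `LLAMOE_ATTENTION_CLASSES` is the module-level registry that maps the configured attention backend ("eager", "flash_attention_2", "sdpa") to an attention class. Below is a minimal sketch of how such a registry is typically consumed; the `config._attn_implementation` lookup and the constructor arguments follow the usual transformers pattern and are assumptions, not taken from this diff.

```python
# Sketch only: assumes LlamoeConfig, LlamoeAttention, LlamoeFlashAttention2 and
# LlamoeSdpaAttention from the surrounding modeling file, plus the common
# transformers convention of selecting the backend via config._attn_implementation.
import torch.nn as nn

class LlamoeDecoderLayer(nn.Module):
    def __init__(self, config: "LlamoeConfig", layer_idx: int):
        super().__init__()
        # Look up the attention implementation chosen in the config
        # ("eager", "flash_attention_2" or "sdpa") in the registry restored above.
        attn_cls = LLAMOE_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attn_cls(config=config, layer_idx=layer_idx)
```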
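The docstring hunks restore the module-level constants (`LLAMOE_START_DOCSTRING`, `LLAMOE_INPUTS_DOCSTRING`) that the `@add_start_docstrings` decorators consume. As a rough illustration, that helper (shipped with `transformers.utils`; the version below is a simplified sketch, not the library implementation) prepends the shared docstring text to the decorated object's `__doc__`:

```python
# Simplified sketch of the add_start_docstrings pattern used in the diff; the real
# helper lives in transformers.utils, this stand-in is only illustrative.
def add_start_docstrings(*docstr):
    def decorator(obj):
        obj.__doc__ = "".join(docstr) + (obj.__doc__ or "")
        return obj
    return decorator

LLAMOE_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. ...
"""

@add_start_docstrings(
    "The bare Llamoe Model outputting raw hidden-states without any specific head on top.",
    LLAMOE_START_DOCSTRING,
)
class LlamoeModel:
    """Model-specific docstring continues here."""
```

This is why the truncated decorator arguments in the old file (an unterminated string literal and a dangling `GEMMOE_START_DOCSTRING` reference) are replaced in this commit.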
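The last hunk drops the leftover Gemmoe/Mixtral doctest from `LlamoeForCausalLM.forward`. If an end-to-end example is wanted elsewhere (for instance in a model card), the usual pattern would look roughly like the sketch below; the repository id is a placeholder, not a real checkpoint.

```python
# Hypothetical usage sketch; "your-org/llamoe-moe" is a placeholder repo id, and
# trust_remote_code=True assumes the model ships as custom remote code.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-org/llamoe-moe")
model = AutoModelForCausalLM.from_pretrained("your-org/llamoe-moe", trust_remote_code=True)

inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
generated = model.generate(inputs.input_ids, max_length=30)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])
```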