damerajee committed
Commit bd78fad
1 Parent(s): 28eb841

Update modeling_Llamoe.py

Files changed (1):
  1. modeling_Llamoe.py +4 -4
modeling_Llamoe.py CHANGED
@@ -467,7 +467,7 @@ class LlamoeAttention(nn.Module):
         return attn_output, attn_weights, past_key_value
 
 
-class LlamoeFlashAttention2(LlamaAttention):
+class LlamoeFlashAttention2(LlamoeAttention):
     """
     Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
@@ -662,7 +662,7 @@ class LlamoeFlashAttention2(LlamaAttention):
         )
 
 
-class LlamoeSdpaAttention(LlamaAttention):
+class LlamoeSdpaAttention(LlamoeAttention):
     """
     Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
     `LlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
@@ -970,7 +970,7 @@ GEMMOE_INPUTS_DOCSTRING = r"""
     GEMMOE_START_DOCSTRING,
 )
 
-class LlamoeModel(GemmoePreTrainedModel):
+class LlamoeModel(LlammoePreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`GemmoeDecoderLayer`]
     Args:
@@ -1180,7 +1180,7 @@ class LlamoeModel(GemmoePreTrainedModel):
 
         return causal_mask
 
-class LlamoeForCausalLM(GemmoePreTrainedModel):
+class LlamoeForCausalLM(LlammoePreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config):
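
Taken together, the four edits point the attention variants at the model's own LlamoeAttention instead of LlamaAttention, and point LlamoeModel / LlamoeForCausalLM at LlammoePreTrainedModel instead of GemmoePreTrainedModel, presumably so every base class matches a name defined in modeling_Llamoe.py itself. Below is a minimal, self-contained sketch (not this file's actual code) of why that inheritance pattern works: the SDPA subclass adds no parameters of its own, so it reuses the base attention's projection weights and overrides only forward() to call torch.nn.functional.scaled_dot_product_attention. The Toy* class names, the dimensions, and the omission of RoPE, attention masking, and KV caching are all illustrative assumptions.

# Sketch of the "same weights, different forward()" inheritance used by the
# attention variants in this commit. Illustrative only; names and shapes
# follow the usual Llama layout, not necessarily this file's exact code.
import torch
import torch.nn as nn
import torch.nn.functional as F


class ToyLlamoeAttention(nn.Module):
    """Stand-in for LlamoeAttention: owns the projection weights."""

    def __init__(self, hidden_size: int = 64, num_heads: int = 4):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.q_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.k_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.v_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.o_proj = nn.Linear(hidden_size, hidden_size, bias=False)

    def _split(self, x: torch.Tensor) -> torch.Tensor:
        # (batch, seq, hidden) -> (batch, heads, seq, head_dim)
        bsz, seq, _ = x.shape
        return x.view(bsz, seq, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        q, k, v = (self._split(p(hidden_states)) for p in (self.q_proj, self.k_proj, self.v_proj))
        # "Eager" attention: explicit softmax(QK^T / sqrt(d)) V
        attn = torch.softmax(q @ k.transpose(-1, -2) / self.head_dim**0.5, dim=-1) @ v
        bsz, _, seq, _ = attn.shape
        return self.o_proj(attn.transpose(1, 2).reshape(bsz, seq, -1))


class ToyLlamoeSdpaAttention(ToyLlamoeAttention):
    """Same weights as the base class; only forward() changes, mirroring
    LlamoeSdpaAttention(LlamoeAttention) after this commit."""

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        q, k, v = (self._split(p(hidden_states)) for p in (self.q_proj, self.k_proj, self.v_proj))
        # Dispatch to PyTorch's fused kernel; RoPE, masking, and KV cache omitted for brevity.
        attn = F.scaled_dot_product_attention(q, k, v)
        bsz, _, seq, _ = attn.shape
        return self.o_proj(attn.transpose(1, 2).reshape(bsz, seq, -1))


if __name__ == "__main__":
    x = torch.randn(2, 8, 64)
    base = ToyLlamoeAttention()
    sdpa = ToyLlamoeSdpaAttention()
    # Because the subclass adds no new parameters, the base state dict loads cleanly.
    sdpa.load_state_dict(base.state_dict())
    print(sdpa(x).shape)  # torch.Size([2, 8, 64])

The same reasoning applies to LlamoeFlashAttention2: as its docstring says, the weights stay untouched and only the forward pass changes, so subclassing LlamoeAttention (a class that actually exists in this file) is enough.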