Update modeling_Hixtral.py
modeling_Hixtral.py  CHANGED  (+2 -2)
@@ -788,7 +788,7 @@ class HixtralDecoderLayer(nn.Module):
         super().__init__()
         self.hidden_size = config.hidden_size
 
-        self.self_attn =
+        self.self_attn = HIXTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
 
         self.block_sparse_moe = HixtralSparseMoeBlock(config)
         self.input_layernorm = HixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
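For context, the new line 791 picks the decoder layer's attention module through a lookup table keyed by config._attn_implementation. The sketch below is a minimal, self-contained illustration of that dispatch pattern only; the class names, the demo config, and the mapping contents are placeholders, since the real HIXTRAL_ATTENTION_CLASSES dict and Hixtral attention classes are defined elsewhere in modeling_Hixtral.py and are not part of this diff.

import torch.nn as nn

# Placeholder attention classes, standing in for the real Hixtral ones.
class EagerAttention(nn.Module):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.layer_idx = layer_idx

class SdpaAttention(EagerAttention):
    pass

# Maps the configured implementation string to a concrete attention class,
# mirroring the role of HIXTRAL_ATTENTION_CLASSES in the diff above.
ATTENTION_CLASSES = {"eager": EagerAttention, "sdpa": SdpaAttention}

class DemoConfig:  # placeholder for the model config object
    _attn_implementation = "sdpa"

config = DemoConfig()
# Mirrors: self.self_attn = HIXTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
self_attn = ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=0)
print(type(self_attn).__name__)  # -> SdpaAttention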
@@ -1020,7 +1020,7 @@ class HixtralModel(HixtralPreTrainedModel):
         self.embed_tokens = value
 
     # Ignore copy
-    @add_start_docstrings_to_model_forward(
+    @add_start_docstrings_to_model_forward(HIXTRAL_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
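The second change completes the decorator call on line 1023 so that HixtralModel.forward is documented with HIXTRAL_INPUTS_DOCSTRING. As a rough illustration of what such a decorator does, here is a simplified stand-in, not the actual transformers.utils implementation, and the docstring text is an abbreviated placeholder: it prepends the shared inputs documentation to the decorated method's own docstring.

# Simplified stand-in for add_start_docstrings_to_model_forward; the real
# transformers.utils helper does additional formatting, this only shows the idea.
HIXTRAL_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (torch.LongTensor): token ids of the input sequence.
"""  # abbreviated placeholder text, not the real docstring

def add_start_docstrings_to_model_forward(*docstrings):
    def decorator(fn):
        # Prepend the shared inputs documentation to the method's own docstring.
        fn.__doc__ = "".join(docstrings) + (fn.__doc__ or "")
        return fn
    return decorator

class HixtralModelSketch:  # placeholder class, not the real HixtralModel
    @add_start_docstrings_to_model_forward(HIXTRAL_INPUTS_DOCSTRING)
    def forward(self, input_ids=None):
        """Runs the decoder stack over input_ids."""
        return input_ids

print(HixtralModelSketch.forward.__doc__)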