Update modeling_Llamoe.py

modeling_Llamoe.py  CHANGED  (+9 -31)
@@ -747,7 +747,7 @@ class LlamoeSdpaAttention(LlamoeAttention):
         return attn_output, None, past_key_value
 
 
-
+LLAMOE_ATTENTION_CLASSES = {
     "eager": LlamoeAttention,
     "flash_attention_2": LlamoeFlashAttention2,
     "sdpa": LlamoeSdpaAttention,
@@ -833,7 +833,7 @@ class LlamoeDecoderLayer(nn.Module):
 
 
 
-
+LLAMOE_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -851,12 +851,8 @@ LLAMA_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare
-
-)
-@add_start_docstrings(
-    "The bare Gemmoe Model outputting raw hidden-states without any specific head on top.",
-    GEMMOE_START_DOCSTRING,
+    "The bare Llamoe Model outputting raw hidden-states without any specific head on top.",
+    LLAMOE_START_DOCSTRING,
 )
 
 class LlammoePreTrainedModel(PreTrainedModel):
@@ -903,7 +899,7 @@ class LlammoePreTrainedModel(PreTrainedModel):
             layer.self_attn.past_key_value = None
 
 
-
+LLAMOE_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -967,14 +963,14 @@ GEMMOE_INPUTS_DOCSTRING = r"""
 
 @add_start_docstrings(
     "The bare Gemmoe Model outputting raw hidden-states without any specific head on top.",
-
+    LLAMOE_START_DOCSTRING,
 )
 
 class LlamoeModel(LlammoePreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamoeDecoderLayer`]
     Args:
-        config:
+        config: LlamoeConfig
     """
 
     def __init__(self, config: LlamoeConfig):
@@ -1229,25 +1225,7 @@ class LlamoeForCausalLM(LlammoePreTrainedModel):
         output_router_logits: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
-
-        Args:
-            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-        Returns:
-        Example:
-        ```python
-        >>> from transformers import AutoTokenizer, GemmoeForCausalLM
-        >>> model = GemmoeForCausalLM.from_pretrained("mistralai/Gemmoe-8x7B-v0.1")
-        >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Gemmoe-8x7B-v0.1")
-        >>> prompt = "Hey, are you conscious? Can you talk to me?"
-        >>> inputs = tokenizer(prompt, return_tensors="pt")
-        >>> # Generate
-        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
-        ```"""
+
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_router_logits = (
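A note on the first hunk: `LLAMOE_ATTENTION_CLASSES` is the module-level registry that maps the configured attention backend ("eager", "flash_attention_2", "sdpa") to an attention class. Below is a minimal sketch of how such a registry is typically consumed; the `config._attn_implementation` lookup and the constructor arguments follow the usual transformers pattern and are assumptions, not taken from this diff.

```python
# Sketch only: assumes LlamoeConfig, LlamoeAttention, LlamoeFlashAttention2 and
# LlamoeSdpaAttention from the surrounding modeling file, plus the common
# transformers convention of selecting the backend via config._attn_implementation.
import torch.nn as nn

class LlamoeDecoderLayer(nn.Module):
    def __init__(self, config: "LlamoeConfig", layer_idx: int):
        super().__init__()
        # Look up the attention implementation chosen in the config
        # ("eager", "flash_attention_2" or "sdpa") in the registry restored above.
        attn_cls = LLAMOE_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attn_cls(config=config, layer_idx=layer_idx)
```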
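The docstring hunks restore the module-level constants (`LLAMOE_START_DOCSTRING`, `LLAMOE_INPUTS_DOCSTRING`) that the `@add_start_docstrings` decorators consume. As a rough illustration, that helper (shipped with `transformers.utils`; the version below is a simplified sketch, not the library implementation) prepends the shared docstring text to the decorated object's `__doc__`:

```python
# Simplified sketch of the add_start_docstrings pattern used in the diff; the real
# helper lives in transformers.utils, this stand-in is only illustrative.
def add_start_docstrings(*docstr):
    def decorator(obj):
        obj.__doc__ = "".join(docstr) + (obj.__doc__ or "")
        return obj
    return decorator

LLAMOE_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. ...
"""

@add_start_docstrings(
    "The bare Llamoe Model outputting raw hidden-states without any specific head on top.",
    LLAMOE_START_DOCSTRING,
)
class LlamoeModel:
    """Model-specific docstring continues here."""
```

This is why the truncated decorator arguments in the old file (an unterminated string literal and a dangling `GEMMOE_START_DOCSTRING` reference) are replaced in this commit.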
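The last hunk drops the leftover Gemmoe/Mixtral doctest from `LlamoeForCausalLM.forward`. If an end-to-end example is wanted elsewhere (for instance in a model card), the usual pattern would look roughly like the sketch below; the repository id is a placeholder, not a real checkpoint.

```python
# Hypothetical usage sketch; "your-org/llamoe-moe" is a placeholder repo id, and
# trust_remote_code=True assumes the model ships as custom remote code.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-org/llamoe-moe")
model = AutoModelForCausalLM.from_pretrained("your-org/llamoe-moe", trust_remote_code=True)

inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
generated = model.generate(inputs.input_ids, max_length=30)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])
```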