Mingke977 committed on
Commit
8d31d53
·
verified ·
1 Parent(s): e416e67

Update modeling_deepseek.py

Browse files
Files changed (1) hide show
  1. modeling_deepseek.py +1 -5
modeling_deepseek.py CHANGED
@@ -22,7 +22,6 @@ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_u
22
  from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
23
  from transformers.processing_utils import Unpack
24
  from transformers.utils import (
25
- LossKwargs,
26
  add_start_docstrings,
27
  add_start_docstrings_to_model_forward,
28
  can_return_tuple,
@@ -903,9 +902,6 @@ class DeepseekV3Model(DeepseekV3PreTrainedModel):
903
  return causal_mask
904
 
905
 
906
- class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
907
-
908
-
909
  class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel, GenerationMixin):
910
  _tied_weights_keys = ["lm_head.weight"]
911
  _tp_plan = {"lm_head": "colwise_rep"}
@@ -955,7 +951,7 @@ class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel, GenerationMixin):
955
  output_hidden_states: Optional[bool] = None,
956
  cache_position: Optional[torch.LongTensor] = None,
957
  logits_to_keep: Union[int, torch.Tensor] = 0,
958
- **kwargs: Unpack[KwargsForCausalLM],
959
  ) -> CausalLMOutputWithPast:
960
  r"""
961
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
 
22
  from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
23
  from transformers.processing_utils import Unpack
24
  from transformers.utils import (
 
25
  add_start_docstrings,
26
  add_start_docstrings_to_model_forward,
27
  can_return_tuple,
 
902
  return causal_mask
903
 
904
 
 
 
 
905
  class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel, GenerationMixin):
906
  _tied_weights_keys = ["lm_head.weight"]
907
  _tp_plan = {"lm_head": "colwise_rep"}
 
951
  output_hidden_states: Optional[bool] = None,
952
  cache_position: Optional[torch.LongTensor] = None,
953
  logits_to_keep: Union[int, torch.Tensor] = 0,
954
+ **kwargs
955
  ) -> CausalLMOutputWithPast:
956
  r"""
957
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):