sana-zero / zerogpu /gemma2.txt
gen6scp's picture
Patched codes for ZeroGPU
d643072
718
719 @add_start_docstrings(
720 "The bare Gemma2 Model outputting raw hidden-states without any specific head on top.",
721 GEMMA2_START_DOCSTRING,
722 )
723 class Gemma2Model(Gemma2PreTrainedModel):
724 """
725 Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Gemma2DecoderLayer`]
726
727 Args:
728 config: Gemma2Config
729 """
730
731 def __init__(self, config: Gemma2Config):
732 super().__init__(config)
733 self.padding_idx = config.pad_token_id
734 self.vocab_size = config.vocab_size
735
736 self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
737 self.layers = nn.ModuleList(
738 [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
739 )
740 self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
741 self.gradient_checkpointing = False
742
743 # Initialize weights and apply final processing
744 self.post_init()
745
746 def get_input_embeddings(self):
747 return self.embed_tokens
748
749 def set_input_embeddings(self, value):
750 self.embed_tokens = value
751
752 @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
753 def forward(
754 self,
755 input_ids: torch.LongTensor = None,
756 attention_mask: Optional[torch.Tensor] = None,
757 position_ids: Optional[torch.LongTensor] = None,
758 past_key_values: Optional[HybridCache] = None,
759 inputs_embeds: Optional[torch.FloatTensor] = None,
760 use_cache: Optional[bool] = None,
761 output_attentions: Optional[bool] = None,
762 output_hidden_states: Optional[bool] = None,
763 return_dict: Optional[bool] = None,
764 cache_position: Optional[torch.LongTensor] = None,
765 ) -> Union[Tuple, BaseModelOutputWithPast]:
766 output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
767 output_hidden_states = (
768 output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
769 )
770 use_cache = use_cache if use_cache is not None else self.config.use_cache
771 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
772
773 if (input_ids is None) ^ (inputs_embeds is not None):
774 raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
775
776 if self.gradient_checkpointing and self.training and use_cache:
777 logger.warning_once(
778 "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
779 )
780 use_cache = False
781
782 if inputs_embeds is None:
783 inputs_embeds = self.embed_tokens(input_ids)
784
785 if use_cache and past_key_values is None and not self.training:
786 batch_size, seq_len, _ = inputs_embeds.shape
787 past_key_values = HybridCache(
788 self.config,
789 batch_size=batch_size,
790 max_cache_len=seq_len,
791 device=self.device,
792 dtype=inputs_embeds.dtype,
793 )
794
795 if cache_position is None:
796 past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
797 cache_position = torch.arange(
798 past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
799 )
800
801 if position_ids is None:
802 position_ids = cache_position.unsqueeze(0)
803
804 causal_mask = self._update_causal_mask(
805 attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
806 )
807
808 # embed positions
809 hidden_states = inputs_embeds
810
811 # normalized
812 # Gemma2 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
813 # See https://github.com/huggingface/transformers/pull/29402
814 normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
815 hidden_states = hidden_states * normalizer
816
817 # decoder layers
818 all_hidden_states = () if output_hidden_states else None
819 all_self_attns = () if output_attentions else None
820
821 for decoder_layer in self.layers:
822 if output_hidden_states:
823 all_hidden_states += (hidden_states,)
824
825 if self.gradient_checkpointing and self.training:
826 layer_outputs = self._gradient_checkpointing_func(
827 decoder_layer.__call__,
828 hidden_states,
829 causal_mask,
830 position_ids,
831 past_key_values,
832 output_attentions,
833 use_cache,
834 cache_position,
835 )
836 else:
837 layer_outputs = decoder_layer(
838 hidden_states,
839 attention_mask=causal_mask,
840 position_ids=position_ids,
841 past_key_value=past_key_values,
842 output_attentions=output_attentions,
843 use_cache=use_cache,
844 cache_position=cache_position,
845 )
846
847 hidden_states = layer_outputs[0]
848
849 if output_attentions:
850 all_self_attns += (layer_outputs[1],)
851
852 hidden_states = self.norm(hidden_states)
853
854 if output_hidden_states:
855 all_hidden_states += (hidden_states,)
856
857 next_cache = past_key_values if use_cache else None
858
859 if not return_dict:
860 return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
861 return BaseModelOutputWithPast(
862 last_hidden_state=hidden_states,
863 past_key_values=next_cache,
864 hidden_states=all_hidden_states,
865 attentions=all_self_attns,
866 )
867
868 def _update_causal_mask(
869 self,
870 attention_mask: torch.Tensor,
871 input_tensor: torch.Tensor,
872 cache_position: torch.Tensor,
873 past_key_values: HybridCache,
874 output_attentions: bool,
875 ):
876 # Flash Attention currently doesn't support static cache but Gemma2 work only with static cache.
877 # So we will pass in attention mask as is in any case, not only when ther's padding. Then we'll use its shape
878 # to cut out keys/values trailing 0 used in static cache. This workaround should be compile compatible
879 # as it doesn't cause dynamic control issues.
880 if self.config._attn_implementation == "flash_attention_2":
881 return attention_mask
882
883 dtype, device = input_tensor.dtype, input_tensor.device
884 sequence_length = input_tensor.shape[1]
885 if isinstance(past_key_values, HybridCache):
886 target_length = past_key_values.get_max_cache_shape()
887 else:
888 target_length = attention_mask.shape[-1] if attention_mask is not None else input_tensor.shape[1]
889
890 # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
891 causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
892 attention_mask,
893 sequence_length=sequence_length,
894 target_length=target_length,
895 dtype=dtype,
896 device=device,
897 cache_position=cache_position,
898 batch_size=input_tensor.shape[0],
899 )
900 return causal_mask
901
902 @staticmethod
903 def _prepare_4d_causal_attention_mask_with_cache_position(
904 attention_mask: torch.Tensor,
905 sequence_length: int,
906 target_length: int,
907 dtype: torch.dtype,
908 device: torch.device,
909 cache_position: torch.Tensor,
910 batch_size: int,
911 **kwargs,
912 ):
913 """
914 Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
915 `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
916
917 Args:
918 attention_mask (`torch.Tensor`):
919 A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
920 `(batch_size, 1, query_length, key_value_length)`.
921 sequence_length (`int`):
922 The sequence length being processed.
923 target_length (`int`):
924 The target length: when generating with static cache, the mask should be as long as the static cache,
925 to account for the 0 padding, the part of the cache that is not filled yet.
926 dtype (`torch.dtype`):
927 The dtype to use for the 4D attention mask.
928 device (`torch.device`):
929 The device to plcae the 4D attention mask on.
930 cache_position (`torch.Tensor`):
931 Indices depicting the position of the input sequence tokens in the sequence.
932 batch_size (`torch.Tensor`):
933 Batch size.
934 """
935 if attention_mask is not None and attention_mask.dim() == 4:
936 # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
937 causal_mask = attention_mask
938 else:
939 min_dtype = torch.finfo(dtype).min
940 causal_mask = torch.full(
941 (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
942 )
943 if sequence_length != 1:
944 causal_mask = torch.triu(causal_mask, diagonal=1)
945 causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
946 causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
947 if attention_mask is not None:
948 causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
949 mask_length = attention_mask.shape[-1]
950 padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
951 padding_mask = padding_mask == 0
952 causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
953 padding_mask, min_dtype
954 )
955
956 return causal_mask
957
958
959 class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):
960 _tied_weights_keys = ["lm_head.weight"]
961
962 def __init__(self, config):
963 super().__init__(config)
964 self.model = Gemma2Model(config)
965 self.vocab_size = config.vocab_size
966 self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
967
968 # Initialize weights and apply final processing
969 self.post_init()
970
971 def get_input_embeddings(self):
972 return self.model.embed_tokens
973
974 def set_input_embeddings(self, value):
975 self.model.embed_tokens = value
976
977 def get_output_embeddings(self):
978 return self.lm_head
979
980 def set_output_embeddings(self, new_embeddings):
981 self.lm_head = new_embeddings
982
983 def set_decoder(self, decoder):
984 self.model = decoder
985
986 def get_decoder(self):
987 return self.model
988
989 @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
990 @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
991 def forward(
992 self,
993 input_ids: torch.LongTensor = None,
994 attention_mask: Optional[torch.Tensor] = None,
995 position_ids: Optional[torch.LongTensor] = None,
996 past_key_values: Optional[HybridCache] = None,
997 inputs_embeds: Optional[torch.FloatTensor] = None,
998 labels: Optional[torch.LongTensor] = None,
999 use_cache: Optional[bool] = None,
1000 output_attentions: Optional[bool] = None,
1001 output_hidden_states: Optional[bool] = None,
1002 return_dict: Optional[bool] = None,
1003 cache_position: Optional[torch.LongTensor] = None,
1004 num_logits_to_keep: int = 0,
1005 **loss_kwargs,
1006 ) -> Union[Tuple, CausalLMOutputWithPast]:
1007 r"""
1008 Args:
1009 labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1010 Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1011 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1012 (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1013
1014 num_logits_to_keep (`int`, *optional*):
1015 Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
1016 `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
1017 token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
1018
1019 Returns:
1020
1021 Example:
1022
1023 ```python
1024 >>> from transformers import AutoTokenizer, GemmaForCausalLM
1025
1026 >>> model = GemmaForCausalLM.from_pretrained("google/gemma-2-9b")
1027 >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
1028
1029 >>> prompt = "What is your favorite condiment?"
1030 >>> inputs = tokenizer(prompt, return_tensors="pt")
1031
1032 >>> # Generate
1033 >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1034 >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1035 "What is your favorite condiment?"
1036 ```"""
1037
1038 if self.training and self.config._attn_implementation != "eager":
1039 logger.warning_once(
1040 "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
1041 f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
1042 )
1043 output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1044 output_hidden_states = (
1045 output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1046 )
1047 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1048 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1049 outputs = self.model(
1050 input_ids=input_ids,
1051 attention_mask=attention_mask,
1052 position_ids=position_ids,
1053 past_key_values=past_key_values,
1054 inputs_embeds=inputs_embeds,
1055 use_cache=use_cache,
1056 output_attentions=output_attentions,
1057 output_hidden_states=output_hidden_states,
1058 return_dict=return_dict,
1059 cache_position=cache_position,
1060 )
1061
1062 hidden_states = outputs[0]
1063 # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1064 logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
1065 if self.config.final_logit_softcapping is not None:
1066 logits = logits / self.config.final_logit_softcapping
1067 logits = torch.tanh(logits)
1068 logits = logits * self.config.final_logit_softcapping
1069
1070 loss = None
1071 if labels is not None:
1072 loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
1073
1074 if not return_dict:
1075 output = (logits,) + outputs[1:]
1076 return (loss,) + output if loss is not None else output
1077
1078 return CausalLMOutputWithPast(
1079 loss=loss,
1080 logits=logits,
1081 past_key_values=outputs.past_key_values,
1082 hidden_states=outputs.hidden_states,
1083 attentions=outputs.attentions,
1084 )
1085
1086 def prepare_inputs_for_generation(
1087 self,
1088 input_ids,
1089 past_key_values=None,
1090 attention_mask=None,
1091 inputs_embeds=None,
1092 cache_position=None,
1093 position_ids=None,
1094 use_cache=True,
1095 num_logits_to_keep=None,
1096 **kwargs,
1097 ):
1098 # Overwritten: has a special cache type, `HybridCache`
1099
1100 # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
1101 # Exception 1: when passing input_embeds, input_ids may be missing entries
1102 # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
1103 if past_key_values is not None:
1104 if inputs_embeds is not None: # Exception 1
1105 input_ids = input_ids[:, -cache_position.shape[0] :]
1106 elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
1107 input_ids = input_ids[:, cache_position]
1108 if attention_mask is not None and position_ids is None:
1109 # create position_ids on the fly for batch generation
1110 position_ids = attention_mask.long().cumsum(-1) - 1
1111 position_ids.masked_fill_(attention_mask == 0, 1)
1112 if past_key_values:
1113 position_ids = position_ids[:, -input_ids.shape[1] :]
1114 # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s
1115 # `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride
1116 # during the decoding. Here, simply using `.contiguous()` is not sufficient as in the
1117 # batch size = 1 case, `position_ids` is already contiguous but with varying stride
1118 # which retriggers a capture.
1119 position_ids = position_ids.clone(memory_format=torch.contiguous_format)
1120
1121 # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1122 if inputs_embeds is not None and cache_position[0] == 0:
1123 model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
1124 else:
1125 # The clone here is for the same reason as for `position_ids`.
1126 model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
1127
1128 if (
1129 isinstance(past_key_values, HybridCache)
1130 and attention_mask.ndim == 2
1131 and not self.config._attn_implementation == "flash_attention_2"
1132 ):
1133 if model_inputs["inputs_embeds"] is not None:
1134 batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
1135 device = model_inputs["inputs_embeds"].device
1136 else:
1137 batch_size, sequence_length = model_inputs["input_ids"].shape
1138 device = model_inputs["input_ids"].device
1139
1140 attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position(
1141 attention_mask,
1142 sequence_length=sequence_length,
1143 target_length=past_key_values.get_max_cache_shape(),
1144 dtype=self.lm_head.weight.dtype,
1145 device=device,
1146 cache_position=cache_position,
1147 batch_size=batch_size,
1148 )
1149
1150 if num_logits_to_keep is not None:
1151 model_inputs["num_logits_to_keep"] = num_logits_to_keep
1152
1153 model_inputs.update(
1154 {
1155 "position_ids": position_ids,
1156 "cache_position": cache_position,
1157 "past_key_values": past_key_values,
1158 "use_cache": use_cache,
1159 "attention_mask": attention_mask,
1160 }
1161 )
1162 return model_inputs
1163
1164
1165 @add_start_docstrings(
1166 """
1167 The Gemma2 Model transformer with a sequence classification head on top (linear layer).
1168
1169 [`Gemma2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1170 (e.g. GPT-2) do.
1171
1172 Since it does classification on the last token, it requires to know the position of the last token. If a
1173 `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1174 no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1175 padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1176 each row of the batch).
1177 """,
1178 GEMMA2_START_DOCSTRING,
1179 )
1180 class Gemma2ForSequenceClassification(Gemma2PreTrainedModel):
1181 def __init__(self, config):
1182 super().__init__(config)
1183 self.num_labels = config.num_labels
1184 self.model = Gemma2Model(config)
1185 self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1186
1187 # Initialize weights and apply final processing
1188 self.post_init()
1189
1190 def get_input_embeddings(self):
1191 return self.model.embed_tokens
1192
1193 def set_input_embeddings(self, value):
1194 self.model.embed_tokens = value
1195
1196 @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
1197 def forward(
1198 self,
1199 input_ids: Optional[torch.LongTensor] = None,
1200 attention_mask: Optional[torch.Tensor] = None,
1201 position_ids: Optional[torch.LongTensor] = None,
1202 past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1203 inputs_embeds: Optional[torch.FloatTensor] = None,
1204 labels: Optional[torch.LongTensor] = None,
1205 use_cache: Optional[bool] = None,
1206 output_attentions: Optional[bool] = None,
1207 output_hidden_states: Optional[bool] = None,
1208 return_dict: Optional[bool] = None,
1209 ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1210 r"""
1211 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1212 Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1213 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1214 `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1215 """
1216 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1217
1218 transformer_outputs = self.model(
1219 input_ids,
1220 attention_mask=attention_mask,
1221 position_ids=position_ids,
1222 past_key_values=past_key_values,
1223 inputs_embeds=inputs_embeds,
1224 use_cache=use_cache,
1225 output_attentions=output_attentions,
1226 output_hidden_states=output_hidden_states,
1227 return_dict=return_dict,
1228 )
1229 hidden_states = transformer_outputs[0]
1230 logits = self.score(hidden_states)
1231
1232 if input_ids is not None:
1233 batch_size = input_ids.shape[0]
1234 else:
1235 batch_size = inputs_embeds.shape[0]
1236
1237 if self.config.pad_token_id is None and batch_size != 1:
1238 raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1239 if self.config.pad_token_id is None:
1240 sequence_lengths = -1
1241 else:
1242 if input_ids is not None:
1243 # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1244 sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1245 sequence_lengths = sequence_lengths % input_ids.shape[-1]
1246 sequence_lengths = sequence_lengths.to(logits.device)
1247 else:
1248 sequence_lengths = -1
1249
1250 pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1251
1252 loss = None
1253 if labels is not None:
1254 loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
1255
1256 if not return_dict:
1257 output = (pooled_logits,) + transformer_outputs[1:]
1258 return ((loss,) + output) if loss is not None else output
1259
1260 return SequenceClassifierOutputWithPast(
1261 loss=loss,
1262 logits=pooled_logits,
1263 past_key_values=transformer_outputs.past_key_values,
1264 hidden_states=transformer_outputs.hidden_states,
1265 attentions=transformer_outputs.attentions,
1266 )
1267
1268
1269 @add_start_docstrings(
1270 """
1271 The Gemma2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states
1272 output) e.g. for Named-Entity-Recognition (NER) tasks.
1273 """,
1274 GEMMA2_START_DOCSTRING,
1275 )
1276 class Gemma2ForTokenClassification(Gemma2PreTrainedModel):
1277 def __init__(self, config):
1278 super().__init__(config)
1279 self.num_labels = config.num_labels
1280 self.model = Gemma2Model(config)
1281 if getattr(config, "classifier_dropout", None) is not None:
1282 classifier_dropout = config.classifier_dropout
1283 elif getattr(config, "hidden_dropout", None) is not None:
1284 classifier_dropout = config.hidden_dropout
1285 else:
1286 classifier_dropout = 0.1
1287 self.dropout = nn.Dropout(classifier_dropout)
1288 self.score = nn.Linear(config.hidden_size, config.num_labels)
1289
1290 # Initialize weights and apply final processing
1291 self.post_init()
1292
1293 def get_input_embeddings(self):
1294 return self.model.embed_tokens
1295
1296 def set_input_embeddings(self, value):
1297 self.model.embed_tokens = value
1298
1299 @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
1300 @add_code_sample_docstrings(
1301 checkpoint=_CHECKPOINT_FOR_DOC,
1302 output_type=TokenClassifierOutput,
1303 config_class=_CONFIG_FOR_DOC,
1304 )
1305 def forward(
1306 self,
1307 input_ids: Optional[torch.LongTensor] = None,
1308 attention_mask: Optional[torch.Tensor] = None,
1309 position_ids: Optional[torch.LongTensor] = None,
1310 past_key_values: Optional[List[torch.FloatTensor]] = None,
1311 inputs_embeds: Optional[torch.FloatTensor] = None,
1312 labels: Optional[torch.LongTensor] = None,
1313 use_cache: Optional[bool] = None,
1314 output_attentions: Optional[bool] = None,
1315 output_hidden_states: Optional[bool] = None,
1316 return_dict: Optional[bool] = None,
1317 ) -> Union[Tuple, TokenClassifierOutput]:
1318 r"""
1319 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1320 Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1321 config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1322 `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1323 """
1324 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1325
1326 outputs = self.model(
1327 input_ids,
1328 attention_mask=attention_mask,
1329 position_ids=position_ids,
1330 past_key_values=past_key_values,
1331 inputs_embeds=inputs_embeds,
1332 use_cache=use_cache,
1333 output_attentions=output_attentions,
1334 output_hidden_states=output_hidden_states,
1335 return_dict=return_dict,
1336 )
1337 sequence_output = outputs[0]
1338 sequence_output = self.dropout(sequence_output)
1339 logits = self.score(sequence_output)
1340
1341 loss = None
1342 if labels is not None:
1343 loss = self.loss_function(logits, labels, self.config)
1344
1345 if not return_dict:
1346 output = (logits,) + outputs[2:]
1347 return ((loss,) + output) if loss is not None else output
1348
1349 return TokenClassifierOutput(
1350 loss=loss,
1351 logits=logits,
1352 hidden_states=outputs.hidden_states,
1353 attentions=outputs.attentions,
1354 )