|
class ModernDecoderBERTTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer that strips any EOS token ids from the input sequence
    before delegating to the standard ``prepare_for_model`` logic.

    The override prevents a pre-existing EOS token in ``ids`` from being
    duplicated when the parent implementation appends special tokens.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def prepare_for_model(
        self,
        ids,
        pair_ids=None,
        add_special_tokens=True,
        padding=False,
        truncation=False,
        max_length=None,
        stride=0,
        pad_to_multiple_of=None,
        return_tensors=None,
        return_token_type_ids=None,
        return_attention_mask=None,
        return_overflowing_tokens=False,
        return_special_tokens_mask=False,
        return_offsets_mapping=False,
        return_length=False,
        verbose=True,
        prepend_batch_axis=False,
        **kwargs
    ):
        """Prepare a sequence of token ids for the model.

        Behaves exactly like the parent implementation, except that when
        ``add_special_tokens`` is True, every occurrence of
        ``self.eos_token_id`` is filtered out of ``ids`` first, so the
        parent can re-add special tokens without duplicating EOS.

        All parameters and the return value are passed through unchanged
        to ``PreTrainedTokenizerFast.prepare_for_model``.
        """
        # Drop existing EOS ids only when the parent will add special tokens;
        # the membership check avoids rebuilding the list in the common case.
        # (Renamed loop variable: the original shadowed the builtin ``id``.)
        if add_special_tokens and self.eos_token_id in ids:
            ids = [token_id for token_id in ids if token_id != self.eos_token_id]

        return super().prepare_for_model(
            ids,
            pair_ids=pair_ids,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            prepend_batch_axis=prepend_batch_axis,
            **kwargs
        )