from transformers import PreTrainedTokenizerFast


class ModernDecoderBERTTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer that strips stray EOS tokens from already-tokenized
    inputs before the standard preparation step."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def prepare_for_model(
        self,
        ids,
        pair_ids=None,
        add_special_tokens=True,
        padding=False,
        truncation=False,
        max_length=None,
        stride=0,
        pad_to_multiple_of=None,
        return_tensors=None,
        return_token_type_ids=None,
        return_attention_mask=None,
        return_overflowing_tokens=False,
        return_special_tokens_mask=False,
        return_offsets_mapping=False,
        return_length=False,
        verbose=True,
        prepend_batch_axis=False,
        **kwargs
    ):
        # Remove any EOS tokens already present in the ids; the base
        # implementation handles special-token insertion itself.
        if add_special_tokens and self.eos_token_id in ids:
            ids = [token_id for token_id in ids if token_id != self.eos_token_id]
        return super().prepare_for_model(
            ids,
            pair_ids=pair_ids,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            prepend_batch_axis=prepend_batch_axis,
            **kwargs
        )
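
A minimal usage sketch follows. The "gpt2" checkpoint is a stand-in chosen only because it ships a fast tokenizer with an EOS token configured; substitute the actual checkpoint this class targets.

tok = ModernDecoderBERTTokenizer.from_pretrained("gpt2")

# Simulate pre-tokenized ids that already contain a stray EOS token.
ids = tok.encode("Hello world", add_special_tokens=False)
ids.append(tok.eos_token_id)

# The override filters the EOS out before delegating to the base class.
out = tok.prepare_for_model(ids, add_special_tokens=True)
print(tok.eos_token_id in out["input_ids"])  # False

Note that fast tokenizers do not route their main `__call__`/`encode` path through `prepare_for_model`, so the override only takes effect when `prepare_for_model` is invoked directly, as above.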