vietnamese-bi-encoder / custom_tokenizer.py
phamson02
add word segmentation before tokenization
c1d85a2
raw
history blame
325 Bytes
from transformers import PhobertTokenizer
from pyvi import ViTokenizer
class CustomPhobertTokenizer(PhobertTokenizer):
    """PhoBERT tokenizer that word-segments text before tokenizing it.

    Every string handed to ``_tokenize`` is first passed through pyvi's
    ``ViTokenizer.tokenize``; the segmented result is then given to the
    parent class's ``_tokenize``.
    """

    def rdr_segment(self, text):
        """Return ``text`` after word segmentation by pyvi's ``ViTokenizer``.

        Args:
            text: The raw input to segment.

        Returns:
            Whatever ``ViTokenizer.tokenize`` produces for ``text``.
        """
        words_joined = ViTokenizer.tokenize(text)
        return words_joined

    def _tokenize(self, text):
        """Segment ``text`` and tokenize the result with the parent class.

        Args:
            text: The raw input to tokenize.

        Returns:
            The value returned by the parent ``_tokenize`` for the
            segmented text.
        """
        # Dispatch through self.rdr_segment (not ViTokenizer directly) so a
        # subclass that overrides the segmenter is honoured here as well.
        return super()._tokenize(self.rdr_segment(text))