# -*- coding: utf-8 -*-
# @Time   : 2021/12/8 12:07 a.m.
# @Author : JianingWang
# @File   : JiebaTokenizer
import jieba
from transformers import BertTokenizer


class JiebaTokenizer(BertTokenizer):
    """BertTokenizer that pre-segments text with jieba.

    Words that already exist in the BERT vocab are emitted whole;
    anything else falls back to standard WordPiece tokenization.
    """

    def __init__(
        self, pre_tokenizer=lambda x: jieba.cut(x, HMM=False), *args, **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.pre_tokenizer = pre_tokenizer

    def _tokenize(self, text, *args, **kwargs):
        split_tokens = []
        # Segment the input with jieba first, then decide per word.
        for word in self.pre_tokenizer(text):
            if word in self.vocab:
                # Whole word is in the vocab: keep it as a single token.
                split_tokens.append(word)
            else:
                # Unknown word: fall back to BERT's WordPiece tokenizer.
                split_tokens.extend(super()._tokenize(word))
        return split_tokens
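

# A minimal usage sketch, not part of the original file: the checkpoint name
# "bert-base-chinese" is an assumption; any BERT checkpoint whose vocab
# contains Chinese words behaves the same way.
if __name__ == "__main__":
    tokenizer = JiebaTokenizer.from_pretrained("bert-base-chinese")
    # Words jieba finds in the vocab stay whole; the rest become WordPieces.
    print(tokenizer.tokenize("自然语言处理很有趣"))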