# File size: 3,072 Bytes
# dfe96b1
"""
@file : tokenization.py
@author : xiaolu
@email : luxiaonlp@163.com
@time : 2022-02-28
"""
import jieba
from transformers import BasicTokenizer, BertTokenizer
class CustomBasicTokenizer(BasicTokenizer):
    """BasicTokenizer variant that keeps jieba words found in the vocabulary whole.

    The Chinese-character splitting step is replaced by a word-level pass:
    the text is segmented with jieba, and every segment that is itself a
    vocabulary entry is kept as one unit instead of being split into
    single characters.
    """

    def __init__(self,
                 vocab,
                 do_lower_case=True,
                 never_split=None,
                 tokenize_chinese_chars=True,
                 strip_accents=None):
        """Store the vocabulary and forward every other option to the parent.

        Args:
            vocab: Container used for ``word in vocab`` membership checks.
            do_lower_case: Forwarded unchanged to ``BasicTokenizer``.
            never_split: Forwarded unchanged to ``BasicTokenizer``.
            tokenize_chinese_chars: Forwarded unchanged to ``BasicTokenizer``.
            strip_accents: Forwarded unchanged to ``BasicTokenizer``.
        """
        parent_options = dict(
            do_lower_case=do_lower_case,
            never_split=never_split,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
        )
        super().__init__(**parent_options)
        self.vocab = vocab

    def _tokenize_chinese_chars(self, text):
        """Insert spaces around vocabulary words and around lone Chinese characters.

        Procedure:
          1. Pre-segment ``text`` with ``jieba.cut`` (HMM disabled) to get
             the words ``[w1, w2, ..., wl]``.
          2. For each word: if it is in the vocabulary, keep it whole,
             padded with a space on each side; otherwise walk its
             characters, padding each Chinese character with spaces and
             copying every other character through untouched.
          3. Concatenate all pieces in their original order.

        Args:
            text: The string to process.

        Returns:
            The re-spaced string.
        """
        pieces = []
        for word in jieba.cut(text, HMM=False):
            if word in self.vocab:
                # Whole word is a vocabulary entry: isolate it as one unit.
                pieces.append(" " + word + " ")
                continue
            # Out-of-vocabulary word: fall back to per-character handling.
            for ch in word:
                if self._is_chinese_char(ord(ch)):
                    pieces.append(" " + ch + " ")
                else:
                    pieces.append(ch)
        return "".join(pieces)
class WoBertTokenizer(BertTokenizer):
    """BertTokenizer whose basic tokenizer is swapped for CustomBasicTokenizer.

    After the parent constructor has run, and only when basic tokenization
    is enabled, ``self.basic_tokenizer`` is replaced by a
    ``CustomBasicTokenizer`` that is given ``self.vocab``.
    """

    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 do_basic_tokenize=True,
                 never_split=None,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 tokenize_chinese_chars=True,
                 strip_accents=None,
                 **kwargs):
        """Initialise the parent tokenizer, then install the custom basic tokenizer.

        Args:
            vocab_file: Passed positionally to ``BertTokenizer``.
            do_lower_case: Forwarded to the parent and to the custom
                basic tokenizer.
            do_basic_tokenize: Forwarded to the parent; the custom basic
                tokenizer is only installed when ``self.do_basic_tokenize``
                is truthy afterwards.
            never_split: Forwarded to the parent and to the custom basic
                tokenizer.
            unk_token: Forwarded to the parent.
            sep_token: Forwarded to the parent.
            pad_token: Forwarded to the parent.
            cls_token: Forwarded to the parent.
            mask_token: Forwarded to the parent.
            tokenize_chinese_chars: Forwarded to the parent and to the
                custom basic tokenizer.
            strip_accents: Forwarded to the parent and to the custom
                basic tokenizer.
            **kwargs: Any extra keyword arguments, forwarded to the parent.
        """
        # Options shared by the parent constructor and the custom basic tokenizer.
        basic_options = dict(
            do_lower_case=do_lower_case,
            never_split=never_split,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
        )
        super().__init__(
            vocab_file,
            do_basic_tokenize=do_basic_tokenize,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **basic_options,
            **kwargs,
        )

        if not self.do_basic_tokenize:
            return

        # self.vocab is read here right after the parent constructor returns;
        # this class never assigns it itself.
        self.basic_tokenizer = CustomBasicTokenizer(vocab=self.vocab, **basic_options)