Update README.md
README.md
CHANGED
@@ -35,7 +35,24 @@ import logging
 from typing import List, Tuple
 from transformers import AutoConfig
 from transformers.models.mt5.modeling_mt5 import MT5ForConditionalGeneration
-
+
+import jieba
+from functools import partial
+from transformers import BertTokenizer
+
+class T5PegasusTokenizer(BertTokenizer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.pre_tokenizer = partial(jieba.cut, HMM=False)
+
+    def _tokenize(self, text, *arg, **kwargs):
+        split_tokens = []
+        for text in self.pre_tokenizer(text):
+            if text in self.vocab:
+                split_tokens.append(text)
+            else:
+                split_tokens.extend(super()._tokenize(text))
+        return split_tokens
 
 def load_model(model_path):
     config = AutoConfig.from_pretrained(model_path)
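For context, the added T5PegasusTokenizer wraps BertTokenizer with a jieba word-level pre-tokenization pass: each whole word that already exists in the vocabulary is kept as a single token, and anything else falls back to BERT's WordPiece splitting via super()._tokenize(). A minimal usage sketch follows; the checkpoint path and the sample sentence are illustrative assumptions, not part of the diff, and it assumes the class above is importable and points at a T5-PEGASUS checkpoint with a BERT-style vocab.txt.

```python
# Sketch only: "path/to/t5-pegasus" is a hypothetical local checkpoint path.
from transformers.models.mt5.modeling_mt5 import MT5ForConditionalGeneration

model_path = "path/to/t5-pegasus"

# from_pretrained is inherited from BertTokenizer, so the subclass loads
# the same way; jieba-based pre-tokenization kicks in inside _tokenize.
tokenizer = T5PegasusTokenizer.from_pretrained(model_path)
model = MT5ForConditionalGeneration.from_pretrained(model_path)

# Whole Chinese words found in vocab.txt stay intact instead of being
# split into characters, which is the point of the jieba pass.
inputs = tokenizer("蓝蓝的天上有一朵白白的云", return_tensors="pt")
summary_ids = model.generate(inputs["input_ids"], max_length=30)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
```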