X1A
/

Chinese
X1A committed on
Commit
8d8e03d
1 Parent(s): 4681f9e

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +18 -1
README.md CHANGED
@@ -35,7 +35,24 @@ import logging
35
  from typing import List, Tuple
36
  from transformers import AutoConfig
37
  from transformers.models.mt5.modeling_mt5 import MT5ForConditionalGeneration
38
- from utils import T5PegasusTokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  def load_model(model_path):
41
  config = AutoConfig.from_pretrained(model_path)
 
35
  from typing import List, Tuple
36
  from transformers import AutoConfig
37
  from transformers.models.mt5.modeling_mt5 import MT5ForConditionalGeneration
38
+
39
+ import jieba
40
+ from functools import partial
41
+ from transformers import BertTokenizer
42
+
43
+ class T5PegasusTokenizer(BertTokenizer):
44
+ def __init__(self, *args, **kwargs):
45
+ super().__init__(*args, **kwargs)
46
+ self.pre_tokenizer = partial(jieba.cut, HMM=False)
47
+
48
+ def _tokenize(self, text, *arg, **kwargs):
49
+ split_tokens = []
50
+ for text in self.pre_tokenizer(text):
51
+ if text in self.vocab:
52
+ split_tokens.append(text)
53
+ else:
54
+ split_tokens.extend(super()._tokenize(text))
55
+ return split_tokens
56
 
57
  def load_model(model_path):
58
  config = AutoConfig.from_pretrained(model_path)