Dy3257 committed
Commit c16f075
1 Parent(s): 6109610

Update tokenizer.py

Files changed (1):
  1. tokenizer.py  +12 -14
tokenizer.py CHANGED
@@ -4,18 +4,18 @@ subprocess.run(["pip", "install", "spacy"])
 
 import spacy
 
-#spacy.cli.download("en_core_web_sm")
+spacy.cli.download("en_core_web_sm")
 
-#from spacy.tokens import Doc
+from spacy.tokens import Doc
 
 # Load the English model
-#nlp = spacy.load('en_core_web_sm')
+nlp = spacy.load('en_core_web_sm')
 
 import nltk
 
-#nltk.download('punkt')
+nltk.download('punkt')
 
-#from nltk.tokenize import word_tokenize
+from nltk.tokenize import word_tokenize
 
 import jieba
 
@@ -38,26 +38,24 @@ with codecs.open('model2_data/bpecode.en', 'r', 'utf-8') as f:
 
 def spacy_tokenize(line):
     # Process the text with spaCy
-    #doc = nlp(line)
+    doc = nlp(line)
     # Get the list of words
-    #words = [token.text for token in doc]
+    words = [token.text for token in doc]
     # Join the words into a single string, separated by single spaces
-    #return ' '.join(words)
-    return ""
+    return ' '.join(words)
 
 
 def nltk_tokenize(line):
     # Tokenize with NLTK's word_tokenize
-    #tokens = word_tokenize(line)
-    #print(tokens)
-    #return tokens
-    return []
+    tokens = word_tokenize(line)
+
+    return tokens
 
 
 def jieba_tokenize(line):
     # Tokenize with jieba
     tokens = list(jieba1.cut(line.strip()))  # strip removes any stray whitespace
-    #print(tokens)
+
     return tokens
 
 def tokenize(line, mode):
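
For reference, a quick smoke test of the three tokenizers this commit re-enables (a minimal sketch, not part of the commit: it calls spacy, nltk, and jieba directly rather than importing tokenizer.py, since the file also installs packages at import time; the example sentences and the use of plain jieba.cut instead of the file's jieba1 object are assumptions for illustration):

    import spacy
    import nltk
    from nltk.tokenize import word_tokenize
    import jieba

    nltk.download('punkt', quiet=True)    # tokenizer data used by word_tokenize
    nlp = spacy.load('en_core_web_sm')    # English pipeline re-enabled by this commit

    line_en = "The weather is nice today."
    line_zh = "今天天气很好"               # "The weather is nice today"

    # spacy_tokenize-style output: space-joined token texts
    print(' '.join(token.text for token in nlp(line_en)))
    # nltk_tokenize-style output: list of word tokens
    print(word_tokenize(line_en))
    # jieba_tokenize-style output: list of Chinese word segments
    print(list(jieba.cut(line_zh.strip())))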