distinct

tokenizer_13a.py  CHANGED  (+1, -3)
@@ -67,8 +67,7 @@ class TokenizerRegexp(BaseTokenizer):
         # no leading or trailing spaces, single space within words
         # return ' '.join(line.split())
         # This line is changed with regards to the original tokenizer (seen above) to return individual words
-
-        print("1:", line.split())
+
         return line.split()
 
 
@@ -98,7 +97,6 @@ class Tokenizer13a(BaseTokenizer):
             line = line.replace("&amp;", "&")
             line = line.replace("&lt;", "<")
             line = line.replace("&gt;", ">")
-        print(line)
 
         return self._post_tokenizer(f" {line} ")
 
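
With the debug prints removed, the behavioral change that remains is the earlier one noted in the comment: TokenizerRegexp now returns line.split() (a list of words) instead of the original ' '.join(line.split()) (a single string), so Tokenizer13a passes that list back to the caller. The snippet below is a minimal sketch of that effect, assuming the modified file is importable as tokenizer_13a from the current directory and that Tokenizer13a instances are callable on a string, as in the upstream sacrebleu tokenizer; the sample sentence and the shown output are illustrative, not taken from this commit.

    # sketch: exercise the modified tokenizer (assumed import path)
    from tokenizer_13a import Tokenizer13a

    tokenizer = Tokenizer13a()

    # The regexp pass still separates punctuation from words; the final
    # step now splits into a list instead of re-joining with spaces.
    tokens = tokenizer("Hello, world! This is a test.")
    print(tokens)
    # expected shape: a list of individual tokens,
    # e.g. ['Hello', ',', 'world', '!', 'This', 'is', 'a', 'test', '.']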