distinct
Browse files- tokenizer_13a.py +2 -0
tokenizer_13a.py
CHANGED
@@ -67,6 +67,7 @@ class TokenizerRegexp(BaseTokenizer):
|
|
67 |
# no leading or trailing spaces, single space within words
|
68 |
# return ' '.join(line.split())
|
69 |
# This line is changed with regards to the original tokenizer (seen above) to return individual words
|
|
|
70 |
return line.split()
|
71 |
|
72 |
|
@@ -96,6 +97,7 @@ class Tokenizer13a(BaseTokenizer):
|
|
96 |
line = line.replace("&amp;", "&")
|
97 |
line = line.replace("&lt;", "<")
|
98 |
line = line.replace("&gt;", ">")
|
|
|
99 |
|
100 |
return self._post_tokenizer(f" {line} ")
|
101 |
|
|
|
67 |
# no leading or trailing spaces, single space within words
|
68 |
# return ' '.join(line.split())
|
69 |
# This line is changed with regards to the original tokenizer (seen above) to return individual words
|
70 |
+
print(line)
|
71 |
return line.split()
|
72 |
|
73 |
|
|
|
97 |
line = line.replace("&amp;", "&")
|
98 |
line = line.replace("&lt;", "<")
|
99 |
line = line.replace("&gt;", ">")
|
100 |
+
print(line)
|
101 |
|
102 |
return self._post_tokenizer(f" {line} ")
|
103 |
|