lsy641 committed on
Commit
c4f50af
1 Parent(s): 9541183
Files changed (1) hide show
  1. tokenizer_13a.py +2 -0
tokenizer_13a.py CHANGED
@@ -67,6 +67,7 @@ class TokenizerRegexp(BaseTokenizer):
67
  # no leading or trailing spaces, single space within words
68
  # return ' '.join(line.split())
69
  # This line is changed with regards to the original tokenizer (seen above) to return individual words
 
70
  return line.split()
71
 
72
 
@@ -96,6 +97,7 @@ class Tokenizer13a(BaseTokenizer):
96
  line = line.replace("&amp;", "&")
97
  line = line.replace("&lt;", "<")
98
  line = line.replace("&gt;", ">")
 
99
 
100
  return self._post_tokenizer(f" {line} ")
101
 
 
67
  # no leading or trailing spaces, single space within words
68
  # return ' '.join(line.split())
69
  # This line is changed with regards to the original tokenizer (seen above) to return individual words
70
+ print(line)
71
  return line.split()
72
 
73
 
 
97
  line = line.replace("&amp;", "&")
98
  line = line.replace("&lt;", "<")
99
  line = line.replace("&gt;", ">")
100
+ print(line)
101
 
102
  return self._post_tokenizer(f" {line} ")
103