Spaces:
Running
Running
from transformers import BertTokenizer, BasicTokenizer | |
from transformers.tokenization_utils import _is_punctuation | |
class OurBasicTokenizer(BasicTokenizer):
    """BasicTokenizer variant with relaxed punctuation splitting for quotes.

    Only ``_run_split_on_punc`` is overridden; everything else is inherited.
    """

    def _run_split_on_punc(self, text, never_split=None):
        """Split ``text`` into runs of ordinary characters and single punctuation marks.

        Every punctuation character becomes its own piece, with two exceptions
        that stay attached to the surrounding word:

        * the apostrophe ``'`` is never split off;
        * the double quote ``"`` is kept when it is immediately followed by a
          character that is not punctuation.

        NOTE(review): presumably these exceptions keep Hebrew geresh/gershayim
        abbreviations in one piece -- confirm with the model authors.

        Args:
            text: The string to split.
            never_split: Optional extra collection of strings that must be
                returned whole, in addition to ``self.never_split``.

        Returns:
            A list of string pieces whose concatenation equals ``text``.
        """
        if text in self.never_split or (never_split and text in never_split):
            return [text]

        pieces = []
        begin_piece = True
        length = len(text)
        for pos, ch in enumerate(text):
            is_split_point = _is_punctuation(ch) and ch != "'"
            if is_split_point and ch == '"':
                # A double quote directly before a non-punctuation character
                # stays inside the current piece.
                has_next = pos + 1 < length
                if has_next and not _is_punctuation(text[pos + 1]):
                    is_split_point = False

            if is_split_point:
                # Punctuation stands alone and forces the next character to
                # open a fresh piece.
                pieces.append([ch])
                begin_piece = True
                continue

            if begin_piece:
                pieces.append([])
                begin_piece = False
            pieces[-1].append(ch)

        return ["".join(piece) for piece in pieces]
def RabbinicTokenizer(tok):
    """Replace ``tok.basic_tokenizer`` with an ``OurBasicTokenizer`` and return ``tok``.

    The tokenizer is modified in place; the same object is returned for
    convenience.

    Args:
        tok: A tokenizer object exposing a ``basic_tokenizer`` attribute that
            has ``do_lower_case`` and ``never_split`` attributes.

    Returns:
        The same ``tok`` object, now using ``OurBasicTokenizer``.
    """
    previous = tok.basic_tokenizer
    lower_case = previous.do_lower_case
    protected_tokens = previous.never_split
    # NOTE(review): only these two settings are carried over; any other
    # constructor options fall back to OurBasicTokenizer's defaults.
    tok.basic_tokenizer = OurBasicTokenizer(lower_case, protected_tokens)
    return tok