import spacy
from spacy.tokenizer import Tokenizer


def create_custom_tokenizer():
    def create_tokenizer(nlp):
        # Extend the language defaults so "/", "-", "," and ":" also
        # split a token wherever they appear inside it.
        infixes = nlp.Defaults.infixes + [
            r"/",
            r"-",
            r",",
            r":",
        ]
        # Additionally split a leading "-" off the front of a token.
        prefixes = nlp.Defaults.prefixes + [
            r"-",
        ]
        prefix_regex = spacy.util.compile_prefix_regex(prefixes)
        infix_regex = spacy.util.compile_infix_regex(infixes)
        # Keep the default suffix rules and tokenizer exceptions so the
        # rest of tokenization behaves as in the stock pipeline; the
        # original snippet omitted these, which silently disables
        # suffix splitting and special-case rules.
        suffix_regex = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
        return Tokenizer(
            nlp.vocab,
            rules=nlp.Defaults.tokenizer_exceptions,
            prefix_search=prefix_regex.search,
            suffix_search=suffix_regex.search,
            infix_finditer=infix_regex.finditer,
        )

    return create_tokenizer
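

# Usage sketch, not part of the original snippet: the pipeline name
# "en_core_web_sm" and the sample sentence are assumptions; any loaded
# spaCy pipeline works. The factory is called once to get the builder,
# which is then applied to the pipeline to replace its tokenizer.
if __name__ == "__main__":
    nlp = spacy.load("en_core_web_sm")  # assumes this model is installed
    nlp.tokenizer = create_custom_tokenizer()(nlp)
    # With "-" and ":" added as infixes, dates like "2021-03-04" and
    # times like "12:30" now split into separate tokens.
    print([t.text for t in nlp("Flight on 2021-03-04 departs 12:30 to/from Berlin")])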