en_tech_model / custom_functions.py
magepol's picture
Update spaCy pipeline
556c5ed verified
raw
history blame contribute delete
671 Bytes
import spacy
from spacy.tokenizer import Tokenizer
@spacy.registry.tokenizers("custom_tokenizer")
def create_custom_tokenizer():
    """Registered factory for a tokenizer that also splits on /, -, , and :.

    spaCy resolves the ``"custom_tokenizer"`` registry entry from the pipeline
    config and calls the returned ``create_tokenizer(nlp)`` with the ``nlp``
    object during pipeline construction.

    Returns:
        A callable ``(nlp) -> Tokenizer`` suitable for ``[nlp.tokenizer]``.
    """

    def create_tokenizer(nlp):
        # Extend the language defaults instead of replacing them, so all
        # standard splitting behaviour is kept and only the extra rules
        # (/, -, ",", ":") are added.
        infixes = nlp.Defaults.infixes + [
            r"/",
            r"-",
            r",",
            r":",
        ]
        prefixes = nlp.Defaults.prefixes + [
            r"-",
        ]
        prefix_regex = spacy.util.compile_prefix_regex(prefixes)
        infix_regex = spacy.util.compile_infix_regex(infixes)
        # BUG FIX: the original passed only prefix_search/infix_finditer,
        # which silently dropped the defaults' suffix rules, tokenizer
        # exceptions and token/URL matchers — trailing punctuation was no
        # longer split off and abbreviation/contraction exceptions stopped
        # applying. Restore them explicitly.
        suffix_regex = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
        return Tokenizer(
            nlp.vocab,
            rules=nlp.Defaults.tokenizer_exceptions,
            prefix_search=prefix_regex.search,
            suffix_search=suffix_regex.search,
            infix_finditer=infix_regex.finditer,
            token_match=nlp.Defaults.token_match,
            url_match=nlp.Defaults.url_match,
        )

    return create_tokenizer