import re

import spacy
from spacy.tokenizer import Tokenizer

"""
Reference command for building a distributable package from the trained
pipeline; the --code flag loads this module so the tokenizer below gets
registered:

python -m spacy package general_en packages -c /Users/nss/sefaria/project/sefaria/spacy_function_registry.py -b wheel,sdist -n torah_ner -v 1.0.0
"""


@spacy.registry.tokenizers("inner_punct_tokenizer")
def inner_punct_tokenizer_factory():
    """Register a tokenizer that also splits on punctuation inside tokens."""

    def inner_punct_tokenizer(nlp):
        # Replace the language's default infix rules with a single character
        # class so that any of these punctuation marks occurring mid-token
        # always triggers a split.
        # infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes)
        infix_re = re.compile(r'''[.,?!:;…‘’`“”"'~–\-/()<>]''')
        # Keep the language's default prefix and suffix behaviour.
        prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
        suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
        return Tokenizer(
            nlp.vocab,
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=None,  # no special-case whole-token matches
        )

    return inner_punct_tokenizer
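

# Usage sketch (an assumption, not part of the original file): once this module
# is loaded -- e.g. via the -c/--code flag in the packaging command above --
# the registered factory can be referenced from a pipeline config:
#
#   [nlp.tokenizer]
#   @tokenizers = "inner_punct_tokenizer"
#
# Or wired up directly for a quick check on a blank pipeline:
#
#   nlp = spacy.blank("en")
#   nlp.tokenizer = inner_punct_tokenizer_factory()(nlp)
#   print([t.text for t in nlp("mid-sentence (parenthetical) hyphen-ation, split!")])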