File size: 896 Bytes
7bd5a1f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
import spacy, re
from spacy.tokenizer import Tokenizer
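"""
Shell command used to build an installable package from the pipeline directory
(`-c` bundles the code file that registers the custom functions):

python -m spacy package general_en packages -c /Users/nss/sefaria/project/sefaria/spacy_function_registry.py -b wheel,sdist -n torah_ner -v 1.0.0
"""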
@spacy.registry.tokenizers("inner_punct_tokenizer")
def inner_punct_tokenizer_factory():
    """Return the tokenizer constructor registered as ``inner_punct_tokenizer``.

    The returned callable takes an ``nlp`` object and builds a ``Tokenizer``
    for it.
    """

    def inner_punct_tokenizer(nlp):
        """Build a ``Tokenizer`` for ``nlp`` with a custom infix pattern.

        Prefix and suffix patterns are compiled from ``nlp.Defaults``; the
        infix pattern is a hand-written punctuation character class used in
        place of the language's default infixes. No ``rules`` argument is
        passed to ``Tokenizer``.
        """
        lang_defaults = nlp.Defaults
        leading_re = spacy.util.compile_prefix_regex(lang_defaults.prefixes)
        trailing_re = spacy.util.compile_suffix_regex(lang_defaults.suffixes)
        # Single-character punctuation class supplied as the infix matcher.
        inner_re = re.compile(r'''[.,?!:;…‘’`“”"'~–\-/()<>]''')
        return Tokenizer(
            nlp.vocab,
            prefix_search=leading_re.search,
            suffix_search=trailing_re.search,
            infix_finditer=inner_re.finditer,
            token_match=None,
        )

    return inner_punct_tokenizer