from spacy.tokenizer import Tokenizer
from spacy.lang.char_classes import (
    ALPHA,
    ALPHA_LOWER,
    ALPHA_UPPER,
    CONCAT_QUOTES,
    LIST_ELLIPSES,
    LIST_ICONS,
    HYPHENS,
)
from spacy.util import compile_infix_regex
from spacy.lang.en import English

nlp = English()


def get_tokenizer_gec(nlp):
    # Custom infix rules for GEC data. Identical to the BEA-19 tokenizer below,
    # except that the hyphen rule is commented out, so hyphenated words
    # (e.g. "well-known") are kept as single tokens.
    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
            ),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ]
    )
    infix_re = compile_infix_regex(infixes)
    return Tokenizer(
        nlp.vocab,
        prefix_search=nlp.tokenizer.prefix_search,
        suffix_search=nlp.tokenizer.suffix_search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
        rules=nlp.Defaults.tokenizer_exceptions,
    )


def get_tokenizer_bea19(nlp):
    # Same infix rules as above, but with the hyphen rule enabled, so
    # hyphenated words are split into separate tokens ("well", "-", "known").
    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
            ),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ]
    )
    infix_re = compile_infix_regex(infixes)
    return Tokenizer(
        nlp.vocab,
        prefix_search=nlp.tokenizer.prefix_search,
        suffix_search=nlp.tokenizer.suffix_search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
        rules=nlp.Defaults.tokenizer_exceptions,
    )


tokenizer_gec = get_tokenizer_gec(nlp)
tokenizer_bea19 = get_tokenizer_bea19(nlp)


def spacy_tokenize_gec(text):
    # Swap in the GEC tokenizer before processing the text.
    nlp.tokenizer = tokenizer_gec
    return [str(w) for w in nlp(text)]


def spacy_tokenize_bea19(text):
    # Swap in the BEA-19 tokenizer before processing the text.
    nlp.tokenizer = tokenizer_bea19
    return [str(w) for w in nlp(text)]
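

# A minimal usage sketch (not part of the original module): the sentence below
# is a hypothetical example chosen to illustrate the one difference between the
# two tokenizers, namely that the BEA-19 rules split on intra-word hyphens
# while the GEC rules keep hyphenated words intact.
if __name__ == "__main__":
    sample = "This is a well-known example, e.g. for hyphen handling."
    print(spacy_tokenize_gec(sample))    # "well-known" expected to stay one token
    print(spacy_tokenize_bea19(sample))  # "well-known" expected to split into "well", "-", "known"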