Spaces:

ofig
/

live-lm-critic

Runtime error

live-lm-critic / utils /spacy_tokenizer.py

Olivia Figueira

Upload code with streamlit addition

b6e5241 about 3 years ago

2.33 kB

	import spacy
	from spacy.tokenizer import Tokenizer
	from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS, HYPHENS
	from spacy.util import compile_infix_regex
	from spacy.lang.en import English
	# Shared English language object: the tokenizer factories below read its
	# vocab, its default prefix/suffix/token_match callables and its tokenizer
	# exceptions when building the custom tokenizers.
	nlp = English()

	def _build_infix_tokenizer(nlp, *, split_hyphens):
	    """Build a ``Tokenizer`` that reuses *nlp*'s defaults but custom infixes.

	    The prefix search, suffix search, ``token_match`` callable and the
	    tokenizer exception rules are taken unchanged from *nlp*; only the infix
	    patterns are replaced.

	    Args:
	        nlp: spaCy language object supplying the vocab and default rules.
	        split_hyphens: when true, a hyphen sitting between two alphabetic
	            characters is also treated as an infix (split point).

	    Returns:
	        A newly constructed ``Tokenizer``. ``nlp`` itself is not modified.
	    """
	    infix_patterns = [
	        # Arithmetic operator between digits, e.g. "1+2".
	        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
	        # Period between a lowercase letter/quote and an uppercase letter/quote.
	        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
	            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
	        ),
	        # Comma directly between two letters.
	        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
	    ]
	    if split_hyphens:
	        # Hyphen (any variant listed in HYPHENS) between two letters.
	        infix_patterns.append(
	            r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS)
	        )
	    # One of : < > = / between a letter or digit and a letter.
	    infix_patterns.append(r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA))

	    infix_re = compile_infix_regex(LIST_ELLIPSES + LIST_ICONS + infix_patterns)
	    default_tokenizer = nlp.tokenizer
	    return Tokenizer(
	        nlp.vocab,
	        prefix_search=default_tokenizer.prefix_search,
	        suffix_search=default_tokenizer.suffix_search,
	        infix_finditer=infix_re.finditer,
	        token_match=default_tokenizer.token_match,
	        rules=nlp.Defaults.tokenizer_exceptions,
	    )


	def get_tokenizer_gec(nlp):
	    """Return the "gec" tokenizer variant for *nlp*.

	    This variant does NOT split on hyphens between letters; all other infix
	    rules match those of ``get_tokenizer_bea19``.

	    Args:
	        nlp: spaCy language object supplying the vocab and default rules.

	    Returns:
	        A new ``Tokenizer`` instance.
	    """
	    return _build_infix_tokenizer(nlp, split_hyphens=False)


	def get_tokenizer_bea19(nlp):
	    """Return the "bea19" tokenizer variant for *nlp*.

	    Identical to ``get_tokenizer_gec`` except that a hyphen between two
	    letters is additionally treated as an infix split point.

	    Args:
	        nlp: spaCy language object supplying the vocab and default rules.

	    Returns:
	        A new ``Tokenizer`` instance.
	    """
	    return _build_infix_tokenizer(nlp, split_hyphens=True)


	# Build both tokenizer variants once at import time so the tokenize helpers
	# below reuse them instead of recompiling the infix regex on every call.
	tokenizer_gec = get_tokenizer_gec(nlp)
	tokenizer_bea19 = get_tokenizer_bea19(nlp)


	def spacy_tokenize_gec(text):
	    """Tokenize *text* with the module-level ``tokenizer_gec``.

	    The tokenizer is called directly rather than being assigned onto the
	    shared ``nlp`` object first. That keeps this function free of global
	    side effects, so its result does not depend on whichever tokenizer is
	    currently attached to ``nlp`` (e.g. one swapped in by a concurrent call).

	    Args:
	        text: the string to tokenize.

	    Returns:
	        A list with ``str(token)`` for every token, in document order.
	    """
	    return [str(token) for token in tokenizer_gec(text)]

	def spacy_tokenize_bea19(text):
	    """Tokenize *text* with the module-level ``tokenizer_bea19``.

	    The tokenizer is called directly rather than being assigned onto the
	    shared ``nlp`` object first. That keeps this function free of global
	    side effects, so its result does not depend on whichever tokenizer is
	    currently attached to ``nlp`` (e.g. one swapped in by a concurrent call).

	    Args:
	        text: the string to tokenize.

	    Returns:
	        A list with ``str(token)`` for every token, in document order.
	    """
	    return [str(token) for token in tokenizer_bea19(text)]