import string

from textsearch import TextSearch
from contractions import contractions_dict, leftovers_dict

ABBREVS = (
    "a.m.",
    "adm.",
    "bros.",
    "co.",
    "corp.",
    "d.c.",
    "dr.",
    "e.g.",
    "gen.",
    "gov.",
    "i.e.",
    "inc.",
    "jr.",
    "ltd.",
    "md.",
    "messrs.",
    "mo.",
    "mont.",
    "mr.",
    "mrs.",
    "ms.",
    "p.m.",
    "ph.d.",
    "rep.",
    "rev.",
    "sen.",
    "st.",
    "vs.",
)


class Tokenizer:
    def __init__(
        self,
        handle_http=False,
        handle_domains=False,
        numbers=True,
        combine_punctuation=True,
        eol="\n",
        currencies=("$",),
        protected_words=None,
        contractions=True,
        language="en",
        abbrevs=ABBREVS,
    ):
        # set(), set() should fall back to just using __iter__ of the automaton for a speed boost
        if language != "en" and contractions:
            raise ValueError("No contractions known for languages other than English.")
        self.contractions = contractions
        self.tokenizer = None
        self.handle_http = handle_http
        self.handle_domains = handle_domains
        self.combine_punctuation = combine_punctuation
        self.numbers = numbers
        self.eol = eol
        self.currencies = currencies or []
        self.protected_words = protected_words or []
        self.abbrevs = abbrevs
        self.explain_dict = {}
        self.setup()

    def setup(self):
        self.tokenizer = TextSearch("sensitive", "norm", set(), set())
        self.add_base_cases()
        self.add_currencies()
        self.add_words(self.protected_words)
        if self.handle_http:
            self.tokenizer.add_http_handler(keep_result=True)
            for word in ["http://", "https://", "www."]:
                self.explain_dict[
                    word
                ] = "regex: when it finds '{}' it will stop after it finds a space.".format(word)
        if self.handle_domains:
            self.add_domain_handler()
        if self.contractions:
            if self.contractions is True:
                self.contractions = {}
                self.contractions.update(contractions_dict)
                self.contractions.update(leftovers_dict)
            self.add_words(self.contractions)
        if self.abbrevs:
            self.add_words(self.abbrevs)

    def add_words(self, words):
        words = words.items() if isinstance(words, dict) else list(words)
        # lists/sets/tuples of plain strings become identity pairs (the list() above
        # also makes sets indexable here); dicts already map source -> replacement
        if words and isinstance(words, list) and isinstance(words[0], str):
            words = [(x, x) for x in words]
        REASON_AS_IS = "protected word: adds word as is, prevents splitting it."
        REASON_UPPER = "protected word: adds word uppercased, prevents splitting it."
        REASON_TITLE = "protected word: adds word titlecased, prevents splitting it."
        for x, y in words:
            self.add(x, y, REASON_AS_IS)
            self.add(x.upper(), y.upper(), REASON_UPPER)
            if y:
                self.add(x[0].upper() + x[1:], y[0].upper() + y[1:], REASON_TITLE)

    def add_domain_handler(self):
        import re
        from tldextract.tldextract import TLD_EXTRACTOR

        valid_re = re.compile("^[a-zA-Z.]+$")
        tlds = ["." + x for x in TLD_EXTRACTOR.tlds if valid_re.match(x)]
        for x in tlds:
            self.add(x, x, "Added by domain handler, keeps the token existing.")

    def add_base_cases(self):
        if self.numbers:
            for x in "0123456789":
                self.keep(x + ",")
                self.keep(x + ".")

        # self.tokenizer.add(" !", " ! ")

        if self.combine_punctuation:
            # combine multiples
            R_COMBINE = "combine punctuation: merges '{}' into '{}' and starts a new sentence."
            for s in "!.?-":
                for i in range(2, 10):
                    # one of these is a splitting char
                    if i == 1 and s == "-":
                        continue
                    c = s * i
                    e = s * 3 if i > 1 else s
                    # end = "$<EOS>$" if i == 1 or s != "-" else " "
                    end = " \n" if i == 1 or s != "-" else " "
                    self.add(c, " {}{}".format(e, end), R_COMBINE.format(c, e + end))

            for i in range(2, 10):
                # self.tokenizer.add("\n" * i, "$<EOS>$")
                self.add("\n" * i, " \n ", "merges newlines")
        for s in "!.?-\n":
            self.add(s, " " + s + "\n", "Splits on '{}', creating a new sentence.".format(s))

        self.split("- ")
        self.split("...")

        # does not work
        # self.tokenizer.add_regex_handler(["!?"], "[!]+[?]+[!?]+", True, return_value=" !? ")
        self.split("!?")
        self.split("!?!")
        self.split("!!?")
        self.split("!??")
        self.split("?!!")
        self.split("?!?")
        self.split("??!")

        for x in string.ascii_letters:
            self.keep(" " + x + ".")

        # for x in string.ascii_letters:
        #     self.tokenizer.add("\n" + x, "\n" + x)

        for s in ":;,":
            self.split(s, "Splits on '{}' (punctuation)".format(s))

        # quotes (make sure we add all the exceptions)
        self.split("'")
        self.split('"')

    def keep(self, x, reason=None):
        """ Whenever it finds x, it will not add whitespace. Prevents direct tokenization. """
        self.tokenizer.add(x, x)
        self.explain_dict[x] = reason or "keep:" + self.keep.__doc__.replace("x", repr(x)).rstrip()

    def split(self, x, reason=None):
        """ Whenever it finds x, it will surround it by whitespace, thus creating a token. """
        self.tokenizer.add(x, " {} ".format(x))
        self.explain_dict[x] = (
            reason or "split:" + self.split.__doc__.replace("x", repr(x)).rstrip()
        )

    def drop(self, x, reason=None):
        """ Whenever it finds x, it will remove it but add a split."""
        self.tokenizer.add(x, " ")
        self.explain_dict[x] = reason or "drop:" + self.drop.__doc__.replace("x", repr(x)).rstrip()

    def strip(self, x, reason=None):
        """ Whenever it finds x, it will remove it without splitting. """
        self.tokenizer.add(x, "")
        self.explain_dict[x] = (
            reason or "strip:" + self.strip.__doc__.replace("x", repr(x)).rstrip()
        )
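
    # The four helpers above differ only in the replacement they register with
    # the underlying TextSearch instance:
    #   keep(x)  -> x        (token kept intact, no split introduced)
    #   split(x) -> " x "    (surrounded by spaces, becomes its own token)
    #   drop(x)  -> " "      (removed, but a split is still introduced)
    #   strip(x) -> ""       (removed without introducing a split)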

    def add(self, x, y, reason):
        self.tokenizer.add(x, y)
        self.explain_dict[x] = reason

    def explain(self, char_or_chars):
        keys = [x for x in self.tokenizer._root_dict if char_or_chars in x]
        if not keys:
            return {
                "explanation": "No explanation, meaning there is nothing specified for the input"
            }
        return [
            {"from": x, "to": self.tokenizer._root_dict[x], "explanation": self.explain_dict[x]}
            for x in keys
        ]

    def remove(self, x):
        if x in self.tokenizer:
            self.tokenizer.remove(x)
            del self.explain_dict[x]

    def add_currencies(self):
        for currency in self.currencies:
            self.split(currency)
            for num in "0123456789":
                # to prevent the . and , from being treated as punct
                for punc in ",.":
                    s = "{currency}{num}{punc}".format(currency=currency, num=num, punc=punc)
                    r = " {currency} {num}{punc}".format(currency=currency, num=num, punc=punc)
                    self.add(s, r, "protecting currency from being seen as a number.")

    def word_tokenize(self, z, return_entities=False, to_lower=False):
        if return_entities:
            a, b = self.tokenizer.replace(" " + z, return_entities=True)
            return a.split(), b
        res = self.tokenizer.replace(" " + z).split()
        if to_lower:
            res = [x.lower() for x in res]
        return res

    def word_newlined_tokenize(self, z):
        sentences = self.sent_tokenize(z)
        return sum([x + ["\n"] for x in sentences[:-1]], []) + sentences[-1]

    def sent_tokenize(self, z):
        return [x.split() for x in self.tokenizer.replace(z).split("\n") if x.strip()]


t = Tokenizer(handle_http=True, handle_domains=False)
word_tokenize = t.word_tokenize
sent_tokenize = t.sent_tokenize
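

# Minimal usage sketch, added for illustration; the example sentence is an
# assumption, not part of the original module. It only exercises the
# module-level `word_tokenize` and `sent_tokenize` defined above.
if __name__ == "__main__":
    text = "Mr. Smith paid $5.00 on www.example.com... Wasn't that cheap?"
    # word_tokenize returns a flat list of tokens; the sentence-break newlines
    # inserted by the replacer are consumed by the final .split().
    print(word_tokenize(text))
    # sent_tokenize returns one list of tokens per detected sentence.
    print(sent_tokenize(text))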
