import string

from textsearch import TextSearch

from contractions import contractions_dict, leftovers_dict
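
# Common abbreviations ending in a period. They are registered as protected
# words so their trailing dot is kept with the token instead of being treated
# as a sentence boundary.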
ABBREVS = (
    "a.m.",
    "adm.",
    "bros.",
    "co.",
    "corp.",
    "d.c.",
    "dr.",
    "e.g.",
    "gen.",
    "gov.",
    "i.e.",
    "inc.",
    "jr.",
    "ltd.",
    "md.",
    "messrs.",
    "mo.",
    "mont.",
    "mr.",
    "mrs.",
    "ms.",
    "p.m.",
    "ph.d.",
    "rep.",
    "rev.",
    "sen.",
    "st.",
    "vs.",
)


class Tokenizer:
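    """Explainable, rule-based tokenizer built on a TextSearch replacer.

    The text is rewritten in place (abbreviations and protected words are kept
    intact, contractions are expanded, punctuation is padded with spaces and
    newlines) and the result is split on whitespace; newlines mark sentence
    boundaries. Every rule stores a reason in explain_dict so explain() can
    report why a pattern is handled the way it is.
    """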
    def __init__(
        self,
        handle_http=False,
        handle_domains=False,
        numbers=True,
        combine_punctuation=True,
        eol="\n",
        currencies=("$",),
        protected_words=None,
        contractions=True,
        language="en",
        abbrevs=ABBREVS,
    ):
        if language != "en" and contractions:
            raise ValueError("No contractions known for languages other than English.")
        self.contractions = contractions
        self.tokenizer = None
        self.handle_http = handle_http
        self.handle_domains = handle_domains
        self.combine_punctuation = combine_punctuation
        self.numbers = numbers
        self.eol = eol
        self.currencies = currencies or []
        self.protected_words = protected_words or []
        self.abbrevs = abbrevs
        self.explain_dict = {}
        self.setup()

    def setup(self):
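        """Create a fresh TextSearch instance and register all tokenization rules on it."""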
        self.tokenizer = TextSearch("sensitive", "norm", set(), set())
        self.add_base_cases()
        self.add_currencies()
        self.add_words(self.protected_words)
        if self.handle_http:
            self.tokenizer.add_http_handler(keep_result=True)
            for word in ["http://", "https://", "www."]:
                self.explain_dict[
                    word
                ] = "regex: when it finds '{}' it will stop after it finds a space.".format(word)
        if self.handle_domains:
            self.add_domain_handler()
        if self.contractions:
            if self.contractions is True:
                self.contractions = {}
                self.contractions.update(contractions_dict)
                self.contractions.update(leftovers_dict)
            self.add_words(self.contractions)
        if self.abbrevs:
            self.add_words(self.abbrevs)

    def add_words(self, words):
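        """Register words, or a {word: replacement} mapping, as protected tokens.

        Uppercased and titlecased variants are registered as well.
        """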
        words = words.items() if isinstance(words, dict) else words
        # use next(iter(...)) instead of indexing so that sets are handled as well
        if words and isinstance(words, (list, set, tuple)) and isinstance(next(iter(words)), str):
            words = [(x, x) for x in words]
        REASON_AS_IS = "protected word: adds word as is, prevents splitting it."
        REASON_UPPER = "protected word: adds word uppercased, prevents splitting it."
        REASON_TITLE = "protected word: adds word titlecased, prevents splitting it."
        for x, y in words:
            self.add(x, y, REASON_AS_IS)
            self.add(x.upper(), y.upper(), REASON_UPPER)
            if y:
                self.add(x[0].upper() + x[1:], y[0].upper() + y[1:], REASON_TITLE)

    def add_domain_handler(self):
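        """Keep known top-level domains (from tldextract) intact instead of splitting on the dot."""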
        import re

        from tldextract.tldextract import TLD_EXTRACTOR

        valid_re = re.compile("^[a-zA-Z.]+$")
        tlds = ["." + x for x in TLD_EXTRACTOR.tlds if valid_re.match(x)]

        for x in tlds:
            self.add(x, x, "Added by domain handler, keeps the token existing.")

    def add_base_cases(self):
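        """Register the default punctuation, number and quote handling rules."""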
        if self.numbers:
            for x in "0123456789":
                self.keep(x + ",")
                self.keep(x + ".")

        if self.combine_punctuation:
            # runs of 2-9 repeated marks (e.g. "!!!!") are normalized to three ("!!!")
            R_COMBINE = "combine punctuation: merges '{}' into '{}' and starts a new sentence."
            for s in "!.?-":
                for i in range(2, 10):
                    if i == 1 and s == "-":
                        continue
                    c = s * i
                    e = s * 3 if i > 1 else s
                    end = " \n" if i == 1 or s != "-" else " "
                    self.add(c, " {}{}".format(e, end), R_COMBINE.format(c, e + end))

            for i in range(2, 10):
                self.add("\n" * i, " \n ", "merges newlines")

        for s in "!.?-\n":
            self.add(s, " " + s + "\n", "Splits on '{}' and starts a new sentence.".format(s))

        self.split("- ")

        self.split("...")

        self.split("!?")
        self.split("!?!")
        self.split("!!?")
        self.split("!??")
        self.split("?!!")
        self.split("?!?")
        self.split("??!")

        for x in string.ascii_letters:
            self.keep(" " + x + ".")

        for s in ":;,":
            self.split(s, "Splits on '{}' (punctuation)".format(s))

        self.split("'")
        self.split('"')
    def keep(self, x, reason=None):
        """ Whenever it finds x, it will not add whitespace. Prevents direct tokenization. """
        self.tokenizer.add(x, x)
        self.explain_dict[x] = reason or "keep:" + self.keep.__doc__.replace("x", repr(x)).rstrip()

    def split(self, x, reason=None):
        """ Whenever it finds x, it will surround it by whitespace, thus creating a token. """
        self.tokenizer.add(x, " {} ".format(x))
        self.explain_dict[x] = (
            reason or "split:" + self.split.__doc__.replace("x", repr(x)).rstrip()
        )

    def drop(self, x, reason=None):
        """ Whenever it finds x, it will remove it but add a split."""
        self.tokenizer.add(x, " ")
        self.explain_dict[x] = reason or "drop:" + self.drop.__doc__.replace("x", repr(x)).rstrip()

    def strip(self, x, reason=None):
        """ Whenever it finds x, it will remove it without splitting. """
        self.tokenizer.add(x, "")
        self.explain_dict[x] = (
            reason or "strip:" + self.strip.__doc__.replace("x", repr(x)).rstrip()
        )

    def add(self, x, y, reason):
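        """Map x to the replacement y and record the reason for explain()."""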
        self.tokenizer.add(x, y)
        self.explain_dict[x] = reason

    def explain(self, char_or_chars):
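        """Return the registered rules whose pattern contains the given characters, with their reasons."""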
        keys = [x for x in self.tokenizer._root_dict if char_or_chars in x]
        if not keys:
            return {
                "explanation": "No explanation: nothing has been specified for this input."
            }
        return [
            {"from": x, "to": self.tokenizer._root_dict[x], "explanation": self.explain_dict[x]}
            for x in keys
        ]

    def remove(self, x):
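        """Remove a previously registered pattern and its explanation."""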
        if x in self.tokenizer:
            self.tokenizer.remove(x)
            del self.explain_dict[x]

    def add_currencies(self):
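        """Tokenize currency symbols separately from the digits that follow them."""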
        for currency in self.currencies:
            self.split(currency)

            for num in "0123456789":
                for punc in ",.":
                    s = "{currency}{num}{punc}".format(currency=currency, num=num, punc=punc)
                    r = " {currency} {num}{punc}".format(currency=currency, num=num, punc=punc)
                    self.add(s, r, "protecting currency from being seen as a number.")

    def word_tokenize(self, z, return_entities=False, to_lower=False):
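        """Tokenize z into a list of words; optionally lowercase them or also return the matched entities."""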
        if return_entities:
            a, b = self.tokenizer.replace(" " + z, return_entities=True)
            return a.split(), b
        res = self.tokenizer.replace(" " + z).split()
        if to_lower:
            res = [x.lower() for x in res]
        return res

    def word_newlined_tokenize(self, z):
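        """Tokenize z into words, inserting a newline token ("\\n") between sentences."""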
        sentences = self.sent_tokenize(z)
        if not sentences:  # guard against IndexError on empty input
            return []
        return sum([x + ["\n"] for x in sentences[:-1]], []) + sentences[-1]

    def sent_tokenize(self, z):
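        """Tokenize z into sentences, each returned as a list of word tokens."""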
        return [x.split() for x in self.tokenizer.replace(z).split("\n") if x.strip()]
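

# Module-level default tokenizer (URL handling enabled) and convenience functions.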
t = Tokenizer(handle_http=True, handle_domains=False)
word_tokenize = t.word_tokenize
sent_tokenize = t.sent_tokenize
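

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only; the sample sentence is made up):
    # tokenize a short text into words and into sentences using the
    # module-level helpers defined above.
    sample = "Dr. Smith arrived at 5 p.m. Isn't that nice? See https://example.com!"
    print(word_tokenize(sample))
    print(sent_tokenize(sample))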