# Filtering and normalization helpers for raw text (e.g. recipe-style
# documents containing "<sep>" separator tokens), built on NLTK tokenizers.
from nltk.tokenize import wordpunct_tokenize as word_tokenize
from nltk.tokenize import sent_tokenize
import re
import six
import textwrap

# Characters to keep: digits, ASCII letters, comma, period, slash, and the
# angle brackets used by sentinel tokens such as "<sep>".
_whitelist = r"[0-9a-z\,\.\/\<\>]+"
_regex = r"0-9a-z\,\.\/\<\>"

def filter_by_lang_regex(text, ratio=0.7, regex=r"0-9a-z\,\.\/\<\>"):
    """Return True if more than `ratio` of the non-space characters in `text` match `regex`."""
    candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text), flags=re.IGNORECASE).replace(" ", "")
    text = text.replace(" ", "")
    if not text:
        # Avoid ZeroDivisionError on empty or whitespace-only input.
        return False
    return (len(candidate_text) / len(text)) > ratio

def filter_by_num_tokens(text, gt=64):
    """Return True if `text` has more than `gt` word tokens."""
    return len(word_tokenize(text)) > gt

def filter_by_num_sents(text, gt=2):
    """Return True if `text` has more than `gt` sentences."""
    return len(sent_tokenize(text)) > gt

def filter_by_steps(text):
    """Return True if `text` looks like instructions, i.e. mentions "step" or "mix all"."""
    return re.search('(step|mix all)', text, re.IGNORECASE) is not None

def filter_by_length(text, gt=40):
    """Return True if `text` is longer than `gt` characters."""
    return len(text) > gt

def filter_by_item(item_list, gt=4):
    """Return True if `item_list` contains more than `gt` items."""
    return len(item_list) > gt

def chars_to_preserve(sentence, whitelist):
    """Drop every character of `sentence` not matched by the `whitelist` pattern,
    re-joining the surviving spans with single spaces."""
    try:
        tokenized = re.findall(whitelist, sentence, re.IGNORECASE)
        return " ".join(tokenized)
    except Exception as error:
        print(
            textwrap.dedent(
                f"""
                Bad characters range {whitelist},
                {error}
                """
            )
        )
        raise

def normalizer(text, whitelist=r"[0-9a-z\,\.\/\<\>]+", do_lowercase=False):
    """Optionally lowercase `text`, strip non-whitelisted characters, and collapse whitespace."""
    if do_lowercase:
        text = text.lower()
    text = chars_to_preserve(text, whitelist=whitelist)
    # str.split() already discards empty/whitespace-only tokens, so a single
    # join collapses all runs of whitespace and trims both ends.
    return " ".join(text.split())

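# A minimal sketch (not part of the original file) of how these predicates
# might be chained before normalization; the `clean_document` name, the
# choice of filters, and their order are assumptions.
def clean_document(text):
    # Reject text that is mostly out-of-whitelist characters, too short, or
    # has too few tokens/sentences to be a usable example.
    keep = (
        filter_by_lang_regex(text)
        and filter_by_length(text)
        and filter_by_num_tokens(text)
        and filter_by_num_sents(text)
    )
    # Hypothetical contract: return the normalized text, or None to drop it.
    return normalizer(text, do_lowercase=True) if keep else None
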
if __name__ == "__main__":
    _text = (
        "Crust, Peanut Butter}Melt <sep> 1/2Butter, 2 c. Eggs, Filling, "
        "Semi- Sweet Chocolate Chips, Milk, Butter, Frosting"
    )
    print(normalizer(_text))

    _text = "step ... "
    print(re.search('(step|mix all)', _text, re.IGNORECASE) is not None)