from hazm import word_tokenize
from hazm import sent_tokenize
import re
import six
from normalizer import normalize

# Characters treated as "Persian" for language filtering: ASCII and Persian
# digits, the Persian alphabet, and the zero-width non-joiner (\u200c).
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"


def filter_by_lang_regex(text, ratio=0.7, regex=persian_regex):
    """Keep a text only if more than `ratio` of its non-space characters match `regex`."""
    candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
    text = text.replace(" ", "")
    return (len(candidate_text) / len(text)) > ratio


def filter_by_num_tokens(text, gt=64):
    """Keep a text only if it contains more than `gt` word tokens."""
    return len(word_tokenize(text)) > gt


def filter_by_num_sents(text, gt=2):
    """Keep a text only if it contains more than `gt` sentences."""
    return len(sent_tokenize(text)) > gt


def filter_by_adv(text, ratio=50):
    """Reject list-like or ad-like texts with too many commas, colon pairs, or Persian commas."""
    comma = text.split(",")
    colon = re.findall(r"""(?:([^\W]+):([^\W]+))""", text)
    virgool = text.split("،")  # Persian comma
    length_add = len(comma) + len(colon) + len(virgool)
    return length_add < ratio


def normalizer(text, do_lowercase=False):
    """Normalize Persian text; optionally lowercase any Latin characters."""
    text = normalize(text)
    if do_lowercase:
        text = text.lower()
    return text
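

# --- Usage sketch (illustrative only, not part of the original pipeline) ---
# A minimal example of chaining the filters above to decide whether a document
# should be kept. The sample string is a hypothetical Persian sentence
# ("This is a sample text for checking the filters.") repeated to exceed the
# token and sentence thresholds; replace it with real corpus documents.
if __name__ == "__main__":
    sample = "این یک متن نمونه برای بررسی فیلترها است. " * 20
    keep = (
        filter_by_lang_regex(sample)
        and filter_by_num_tokens(sample)
        and filter_by_num_sents(sample)
        and filter_by_adv(sample)
    )
    print("keep document:", keep)
    print(normalizer(sample)[:60])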