import re

import six
from hazm import sent_tokenize, word_tokenize

from normalizer import normalize

# Characters counted as "Persian" (ASCII and Persian digits, Persian letters, ZWNJ)
# when estimating how much of a text is Persian.
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"


def filter_by_lang_regex(text, ratio=0.7, regex=persian_regex):
    """Keep texts whose share of Persian characters (spaces ignored) exceeds `ratio`."""
    candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
    text = text.replace(" ", "")
    if not text:  # avoid division by zero on empty or whitespace-only input
        return False
    return (len(candidate_text) / len(text)) > ratio


def filter_by_num_tokens(text, gt=64):
    """Keep texts with more than `gt` word tokens."""
    return len(word_tokenize(text)) > gt


def filter_by_num_sents(text, gt=2):
    """Keep texts with more than `gt` sentences."""
    return len(sent_tokenize(text)) > gt


def filter_by_adv(text, ratio=50):
    """Keep texts with fewer than `ratio` separator hits (commas, `word:word` pairs,
    Persian commas), dropping list-like or ad-like content."""
    comma = text.split(",")
    colon = re.findall(r"(?:([^\W]+):([^\W]+))", text)
    virgool = text.split("،")  # Persian comma
    length_add = len(comma) + len(colon) + len(virgool)
    return length_add < ratio


def normalizer(example):
    """Normalize the `text` field of a dataset example in place."""
    example["text"] = normalize(example["text"])
    return example
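

# Hedged usage sketch (not part of the original script): the `example["text"]`
# signature of `normalizer` suggests these helpers are meant to be applied with
# Hugging Face `datasets` via `.map()` / `.filter()`. The data file name and the
# "text" column below are assumptions made purely for illustration.
if __name__ == "__main__":
    from datasets import load_dataset  # assumed dependency for this sketch

    # Hypothetical raw corpus loaded from a plain-text file, one example per line.
    raw = load_dataset("text", data_files={"train": "corpus.txt"})["train"]

    # Normalize first, then apply the language, length, and ad-likeness filters.
    cleaned = (
        raw.map(normalizer)
        .filter(lambda x: filter_by_lang_regex(x["text"]))
        .filter(lambda x: filter_by_num_tokens(x["text"]))
        .filter(lambda x: filter_by_num_sents(x["text"]))
        .filter(lambda x: filter_by_adv(x["text"]))
    )
    print(f"kept {len(cleaned)} of {len(raw)} examples")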