import re
import string

import hazm

import dictionary
from regexes.currency import CURRENCY_REGEX
from regexes.email import EMAIL_REGEX
from regexes.latin import LATIN_REGEX, LATIN_WITH_SPECIAL_REGEX
from regexes.number import NUMBERS_REGEX
from regexes.persian import PERSIAN_REGEX
from regexes.phone import PHONE_REGEX
from regexes.punk import PUNK_REGEX
from regexes.quote import DOUBLE_QUOTE_REGEX, SINGLE_QUOTE_REGEX
from regexes.url import URL_REGEX

# Characters that may legitimately appear in URLs/emails; used by clean_url
# to reassemble links that were broken apart by stray spaces.
allowed_char = string.ascii_letters + string.digits + ':/@_-. '


def make_trans(list_a, list_b):
    """Build a str.translate table mapping each char of list_a to list_b."""
    return dict((ord(a), b) for a, b in zip(list_a, list_b))


def multiple_replace(text, chars_to_mapping):
    """Replace every key of chars_to_mapping with its value in a single pass."""
    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))


def remove_adv_by_tag_name(text, tag_name):
    # str.find returns -1 when the tag is absent; `> 0` also skips a match at
    # index 0, so the whole text is never wiped out.
    found = text.find(tag_name)
    if found > 0:
        text = text[:found]
    return text


def clean_url(text):
    # remove HTML tags
    text = re.sub('<.*?>', '', text)

    # remove well-formed URLs (no internal spaces)
    text = re.sub(
        r'(?:(?:http|https):\/\/)?([-a-zA-Z0-9.]{2,256}\.[a-z]{2,4})\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?',
        "", text)

    # remove URLs broken apart by spaces: keep only URL-safe characters,
    # drop the spaces, then look the reassembled fragments up in the text
    result = ''
    for char in text:
        if char in allowed_char:
            result += char
    result = result.replace(' ', '')
    result = result.split(':')

    for phrase in result:
        p = phrase
        if '//' in p:
            if ('https :' + p) in text:
                text = text.replace('https :' + p, '')
            elif ('http :' + p) in text:
                text = text.replace('http :' + p, '')
        elif '@' in p:
            if p in text:
                text = text.replace(p, '')

    return text


# Arabic-Indic digits (and the Arabic percent sign) -> Persian digits,
# then Persian digits -> ASCII digits.
ar2fa_digits = make_trans("٠١٢٣٤٥٦٧٨٩٪", "۰۱۲۳۴۵۶۷۸۹٪")
fa2en_digits = make_trans("۰۱۲۳۴۵۶۷۸۹٪", "0123456789%")
normalizer = hazm.Normalizer(persian_numbers=True, punctuation_spacing=False)


def normalize(text, zwnj="\u200c", tokenized=False):
    text = text.replace("\n", " ").replace("\t", " ")
    text = re.sub(r"\u200c+", "\u200c", text)  # collapse repeated ZWNJs
    text = text.replace('ـ', '')  # drop the Arabic tatweel (kashida)

    text = normalizer.normalize(text)

    if len(dictionary.characters) > 0:
        text = multiple_replace(text, dictionary.characters)

    if len(dictionary.words_map) > 0:
        text = multiple_replace(text, dictionary.words_map)

    text = text.translate(ar2fa_digits)
    text = text.translate(fa2en_digits)

    text = SINGLE_QUOTE_REGEX.sub("'", text)
    text = DOUBLE_QUOTE_REGEX.sub('"', text)
    text = CURRENCY_REGEX.sub(r" \1 ", text)
    text = clean_url(text)
    # strip trailing ad/footer blocks introduced by the Persian label
    # "برچسب ها :" ("tags:")
    text = remove_adv_by_tag_name(text, tag_name="برچسب ها :")
    text = URL_REGEX.sub(" ", text)
    text = EMAIL_REGEX.sub(" ", text)
    text = PHONE_REGEX.sub(r" \1 ", text)
    text = NUMBERS_REGEX.sub(r" \1 ", text)
    text = LATIN_REGEX.sub(r" \1 ", text)
    # text = PUNK_REGEX.sub(r" \1 ", text)  # intentionally disabled: punctuation must stay as-is

    # allow only English and Persian characters
    text = re.sub(PERSIAN_REGEX, " ", text)

    # re-attach ZWNJs that got surrounded by spaces
    text = text.replace(f" {zwnj} ", f"{zwnj}")
    text = text.replace(f"{zwnj} ", f"{zwnj}")
    text = text.replace(f" {zwnj}", f"{zwnj}")

    if len(dictionary.special_tokens) > 0:
        text = multiple_replace(text, dictionary.special_tokens)

    tokens = []
    for token in text.split():
        token = token.strip()
        if token:
            # strip leading/trailing ZWNJs left over from the replacements above
            if token.startswith(zwnj) and token.endswith(zwnj):
                token = token[1:-1]
            if token.startswith(zwnj):
                token = token[1:]
            elif token.endswith(zwnj):
                token = token[:-1]
            tokens.append(token)

    if tokenized:
        return tokens

    return " ".join(tokens)


if __name__ == '__main__':
    import textwrap

    input_text = " «هفتاد سی» "
    print(textwrap.fill(normalize(input_text)))
    print(normalize(input_text, tokenized=True))
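
    # A minimal, hedged sketch of what the translation tables and
    # multiple_replace do. The sample strings and the character map below are
    # made up for illustration, not taken from this project's dictionaries:
    # Arabic-Indic "٠١٢٣٪" becomes Persian "۰۱۲۳٪" and then ASCII "0123%",
    # and a hypothetical map folds Arabic kaf/yeh into their Persian forms,
    # the kind of entry dictionary.characters is expected to hold.
    sample = "٠١٢٣٪"
    assert sample.translate(ar2fa_digits) == "۰۱۲۳٪"
    assert sample.translate(ar2fa_digits).translate(fa2en_digits) == "0123%"
    assert multiple_replace("كتاب هاي خوب", {"ك": "ک", "ي": "ی"}) == "کتاب های خوب"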