import re
import string

import hazm

import dictionary
from regexes.currency import CURRENCY_REGEX
from regexes.email import EMAIL_REGEX
from regexes.latin import LATIN_REGEX, LATIN_WITH_SPECIAL_REGEX
from regexes.number import NUMBERS_REGEX
from regexes.persian import PERSIAN_REGEX
from regexes.phone import PHONE_REGEX
from regexes.punk import PUNK_REGEX
from regexes.quote import DOUBLE_QUOTE_REGEX, SINGLE_QUOTE_REGEX
from regexes.url import URL_REGEX
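
# Characters kept by clean_url's character-level scan when hunting for
# URLs that were broken apart by spaces.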
allowed_char = string.ascii_letters + string.digits + ':/@_-. '
def make_trans(list_a, list_b):
    """Build a str.translate table mapping each character of list_a to the
    character at the same position in list_b."""
    return dict((ord(a), b) for a, b in zip(list_a, list_b))
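

# multiple_replace performs dictionary-driven substring replacement; the
# keys are regex-escaped, so they match literally.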
def multiple_replace(text, chars_to_mapping):
    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
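

# Crawled articles often end with a tag/advertisement block introduced by a
# fixed marker; remove_adv_by_tag_name cuts the text off at that marker.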
def remove_adv_by_tag_name(text, tag_name):
    found = text.find(tag_name)
    if found != -1:  # str.find returns -1 when absent; position 0 is valid
        text = text[:found]
    return text
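

# clean_url removes HTML tags and URLs. Besides well-formed URLs, it also
# catches URLs that were broken apart by spaces (e.g. "https ://example.com"):
# it keeps only URL-safe characters, closes the gaps, and looks each candidate
# back up in the original text.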
def clean_url(text):
    # Remove HTML tags.
    text = re.sub('<.*?>', '', text)
    # Remove normal URLs (those without internal spaces).
    text = re.sub(r'(?:(?:http|https):\/\/)?([-a-zA-Z0-9.]{2,256}\.[a-z]{2,4})\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?',
                  "", text)
    # Remove URLs that contain spaces.
    result = ''
    for char in text:
        if char in allowed_char:
            result += char
    result = result.replace(' ', '')
    for p in result.split(':'):
        if '//' in p:
            if ('https :' + p) in text:
                text = text.replace('https :' + p, '')
            elif ('http :' + p) in text:
                text = text.replace('http :' + p, '')
        elif '@' in p:
            if p in text:
                text = text.replace(p, '')
    return text
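

# Digit translation tables: Arabic-Indic digits (and the Arabic percent sign)
# to Persian, then Persian digits to ASCII.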
ar2fa_digits = make_trans("٠١٢٣٤٥٦٧٨٩٪", "۰۱۲۳۴۵۶۷۸۹٪")
fa2en_digits = make_trans("۰۱۲۳۴۵۶۷۸۹٪", "0123456789%")
normalizer = hazm.Normalizer(persian_numbers=True, punctuation_spacing=False)
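

# normalize runs the full cleaning pipeline: hazm normalization, dictionary
# mappings, digit folding, quote/currency unification, URL/e-mail/phone
# handling, character filtering, and ZWNJ cleanup.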
def normalize(text, zwnj="\u200c", tokenized=False):
    text = text.replace("\n", " ").replace("\t", " ")
    # Collapse runs of ZWNJ and drop the Arabic tatweel (kashida).
    text = re.sub(r"\u200c+", "\u200c", text)
    text = text.replace('ـ', '')
    text = normalizer.normalize(text)

    if len(dictionary.characters) > 0:
        text = multiple_replace(text, dictionary.characters)
    if len(dictionary.words_map) > 0:
        text = multiple_replace(text, dictionary.words_map)

    # Fold Arabic-Indic digits to Persian, then Persian digits to ASCII.
    text = text.translate(ar2fa_digits)
    text = text.translate(fa2en_digits)

    # Unify quote characters and pad currency amounts with spaces.
    text = SINGLE_QUOTE_REGEX.sub("'", text)
    text = DOUBLE_QUOTE_REGEX.sub('"', text)
    text = CURRENCY_REGEX.sub(r" \1 ", text)

    text = clean_url(text)
    # "برچسب ها :" means "Tags:"; cut off the tag/ad block that follows it.
    text = remove_adv_by_tag_name(text, tag_name="برچسب ها :")
    text = URL_REGEX.sub(" ", text)
    text = EMAIL_REGEX.sub(" ", text)
    text = PHONE_REGEX.sub(r" \1 ", text)
    text = NUMBERS_REGEX.sub(r" \1 ", text)
    text = LATIN_REGEX.sub(r" \1 ", text)
    # text = PUNK_REGEX.sub(r" \1 ", text)  # intentionally disabled: punctuation must stay as-is

    # Keep only English and Persian characters.
    text = re.sub(PERSIAN_REGEX, " ", text)

    # Remove spaces the substitutions above may have left around ZWNJ.
    text = text.replace(f" {zwnj} ", f"{zwnj}")
    text = text.replace(f"{zwnj} ", f"{zwnj}")
    text = text.replace(f" {zwnj}", f"{zwnj}")

    if len(dictionary.special_tokens) > 0:
        text = multiple_replace(text, dictionary.special_tokens)

    tokens = []
    for token in text.split():
        token = token.strip()
        if token:
            # Strip any ZWNJ left at either end of the token.
            token = token.strip(zwnj)
            tokens.append(token)

    if tokenized:
        return tokens
    return " ".join(tokens)


if __name__ == '__main__':
    import textwrap

    input_text = " «هفتاد سی» "  # "seventy thirty"
    input_text = normalize(input_text)
    input_text = DOUBLE_QUOTE_REGEX.sub('"', input_text)
    print(textwrap.fill(input_text))
    print(normalize(input_text, tokenized=True))