File size: 4,198 Bytes
21d29cb
 
09f9c26
21d29cb
 
 
 
 
 
 
 
 
 
 
 
 
09f9c26
 
21d29cb
 
 
 
 
 
 
 
 
31bf2aa
 
 
 
 
 
 
 
 
21d29cb
09f9c26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21d29cb
 
09f9c26
21d29cb
 
 
 
 
09f9c26
21d29cb
 
 
 
 
09f9c26
 
 
21d29cb
 
 
 
 
 
09f9c26
31bf2aa
09f9c26
 
21d29cb
 
 
09f9c26
21d29cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c36ebf7
09f9c26
 
 
 
31bf2aa
09f9c26
31bf2aa
09f9c26
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import hazm
import re
import string

from regexes.currency import CURRENCY_REGEX
from regexes.email import EMAIL_REGEX
from regexes.latin import LATIN_REGEX
from regexes.latin import LATIN_REGEX, LATIN_WITH_SPECIAL_REGEX
from regexes.number import NUMBERS_REGEX
from regexes.phone import PHONE_REGEX
from regexes.quote import DOUBLE_QUOTE_REGEX, SINGLE_QUOTE_REGEX
from regexes.url import URL_REGEX
from regexes.persian import PERSIAN_REGEX
from regexes.punk import PUNK_REGEX
import dictionary

# Characters retained by clean_url() when scanning for space-broken URLs:
# ASCII letters/digits plus the punctuation that commonly appears in URLs
# and e-mail/handles (':/@_-.'), and the space separator itself.
allowed_char = string.ascii_letters + string.digits + ':/@_-. '


def make_trans(list_a, list_b):
    """Build a str.translate table mapping each char of *list_a* (by code
    point) to the char at the same position in *list_b*."""
    return {ord(src): dst for src, dst in zip(list_a, list_b)}


def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of each key of *chars_to_mapping* in *text*
    with its mapped value, in a single regex pass.

    *text* is coerced to str first. Assumes a non-empty mapping (an empty
    one would yield an empty alternation pattern).
    """
    keys_pattern = re.compile("|".join(re.escape(key) for key in chars_to_mapping))
    return keys_pattern.sub(lambda match: chars_to_mapping[match.group()], str(text))


def remove_adv_by_tag_name(text, tag_name):
    """Truncate *text* at the first occurrence of *tag_name*.

    Everything from the tag onward (the tag itself included) is dropped;
    if the tag is absent, *text* is returned unchanged.

    Fix: the original tested ``found > 0``, which silently skipped the case
    where the tag sits at index 0 (str.find returns 0 there, -1 on miss),
    leaving the ad block in place.
    """
    found = text.find(tag_name)

    if found != -1:
        text = text[:found]

    return text


def clean_url(text):
    """Strip HTML tags and URLs from *text*.

    Removes markup, well-formed (space-free) URLs, and URLs whose scheme
    was split from the rest by a space (e.g. "https ://...").
    """
    # Drop HTML tags.
    text = re.sub('<.*?>', '', text)

    # Drop ordinary URLs that contain no internal spaces.
    text = re.sub(r'(?:(?:http|https):\/\/)?([-a-zA-Z0-9.]{2,256}\.[a-z]{2,4})\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?', "",
                  text)

    # Hunt for URLs that were broken up with spaces: keep only URL-ish
    # characters, drop double-space runs, then split on ':' so each piece
    # after a scheme becomes a candidate URL tail.
    kept = ''.join(ch for ch in text if ch in allowed_char)
    kept = kept.replace('  ', '')
    for candidate in kept.split(':'):
        if '//' in candidate:
            for prefix in ('https :', 'http :'):
                if prefix + candidate in text:
                    text = text.replace(prefix + candidate, '')
                    break
        elif '@' in candidate and candidate in text:
            # Handles / e-mail-like fragments are removed wholesale.
            text = text.replace(candidate, '')

    return text


# Translate tables: Arabic-Indic digits -> Persian digits, then Persian
# digits (and the Arabic percent sign) -> ASCII. Applied in sequence inside
# normalize() so all numerals end up ASCII.
ar2fa_digits = make_trans("٠١٢٣٤٥٦٧٨٩٪", "۰۱۲۳۴۵۶۷۸۹٪")
fa2en_digits = make_trans("۰۱۲۳۴۵۶۷۸۹٪", "0123456789%")
# Shared hazm normalizer; punctuation spacing is disabled here and handled
# by the regex substitutions in normalize() instead.
normalizer = hazm.Normalizer(persian_numbers=True, punctuation_spacing=False)


def normalize(text, zwnj="\u200c", tokenized=False):
    """Normalize mixed Persian/English text for downstream tokenization.

    Pipeline: flatten whitespace -> hazm normalization -> project-specific
    character/word mappings -> digit unification -> quote/currency/URL/
    email/phone/number/Latin regex substitutions -> charset filtering ->
    ZWNJ cleanup -> whitespace tokenization.

    :param text: input string (newlines/tabs are treated as spaces).
    :param zwnj: the zero-width non-joiner character used for cleanup.
    :param tokenized: if True return the token list, else the joined string.
    """
    text = text.replace("\n", " ").replace("\t", " ")
    # Collapse runs of ZWNJ into a single one.
    text = re.sub(r"\u200c+", "\u200c", text)
    # Drop the Arabic tatweel (kashida) stretching character entirely.
    text = text.replace('ـ', '')
    text = normalizer.normalize(text)

    # Project-supplied character- and word-level replacement maps.
    if len(dictionary.characters) > 0:
        text = multiple_replace(text, dictionary.characters)

    if len(dictionary.words_map) > 0:
        text = multiple_replace(text, dictionary.words_map)

    # Arabic-Indic -> Persian -> ASCII digits (two passes, see tables above).
    text = text.translate(ar2fa_digits)
    text = text.translate(fa2en_digits)

    # NOTE(review): the exact behavior of these substitutions depends on the
    # project regexes imported at the top of the file; \1 groups are assumed
    # to capture the matched entity — confirm against regexes/ definitions.
    text = SINGLE_QUOTE_REGEX.sub("'", text)
    text = DOUBLE_QUOTE_REGEX.sub('"', text)
    text = CURRENCY_REGEX.sub(r" \1 ", text)
    text = clean_url(text)
    # Truncate trailing "tags" advertisement sections (Persian tag marker).
    text = remove_adv_by_tag_name(text, tag_name="برچسب ها :")
    text = URL_REGEX.sub(" ", text)
    text = EMAIL_REGEX.sub(" ", text)
    text = PHONE_REGEX.sub(r" \1 ", text)
    text = NUMBERS_REGEX.sub(r" \1 ", text)
    text = LATIN_REGEX.sub(r" \1 ", text)
    # text = PUNK_REGEX.sub(r" \1 ", text)  # must be remained the same!

    # Allow only english and persian characters
    text = re.sub(PERSIAN_REGEX, " ", text)

    # Re-attach ZWNJ to its neighbors: remove spaces around it in any form.
    text = text.replace(f" {zwnj} ", f"{zwnj}")
    text = text.replace(f"{zwnj} ", f"{zwnj}")
    text = text.replace(f" {zwnj}", f"{zwnj}")

    if len(dictionary.special_tokens) > 0:
        text = multiple_replace(text, dictionary.special_tokens)

    # Strip leading/trailing ZWNJ from each token. Note the second check
    # re-tests startswith after a both-ends strip, so a doubly-wrapped token
    # loses up to two ZWNJs per side boundary.
    tokens = []
    for token in text.split():
        token = token.strip()
        if token:
            if token.startswith(zwnj) and token.endswith(zwnj):
                token = token[1:-1]
            if token.startswith(zwnj):
                token = token[1:]
            elif token.endswith(zwnj):
                token = token[:-1]
            else:
                token = token

            tokens.append(token)

    if tokenized:
        return tokens

    return " ".join(tokens)



if __name__ == '__main__':
    import textwrap

    # NOTE(review): scratch examples kept for manual experimentation; the
    # script currently performs no work when run directly.
    # input_text = " «هفتاد سی» "
    # input_text = normalize(input_text)
    # input_text = DOUBLE_QUOTE_REGEX.sub('"', input_text)
    # print(textwrap.fill(input_text))
    # print(normalize(input_text, tokenized=True))