File size: 6,318 Bytes
7b54849
349b2ad
 
cf5d96f
7b54849
 
 
 
 
 
 
 
cf5d96f
7b54849
cf5d96f
 
 
7b54849
77f184e
cf5d96f
953871c
 
 
 
 
 
 
 
 
 
 
17873e4
6180416
 
 
 
 
 
 
 
 
17873e4
6180416
 
 
 
17873e4
6180416
 
 
 
 
 
 
 
d663a5a
17873e4
30f2cde
 
349b2ad
7b54849
 
cf5d96f
7b54849
 
cf5d96f
7b54849
 
cf5d96f
7b54849
 
cf5d96f
7b54849
 
c7de0f6
 
 
c0a9b4d
305bf1a
c0a9b4d
305bf1a
 
 
 
 
 
c0a9b4d
 
 
 
86e7d18
 
c7de0f6
77f184e
349b2ad
 
953871c
 
 
e5a3778
7b54849
77f184e
e5a3778
d663a5a
 
 
 
 
 
 
953871c
e5a3778
953871c
 
 
d663a5a
6180416
 
 
 
 
 
d663a5a
 
 
 
 
 
 
 
 
953871c
 
 
d663a5a
 
 
 
 
 
953871c
7b54849
953871c
e5a3778
 
 
 
 
349b2ad
6180416
 
953871c
349b2ad
 
 
 
 
 
 
 
 
 
 
 
c010ef4
 
30f2cde
 
5fe2eb9
d663a5a
30f2cde
5fe2eb9
953871c
 
 
 
 
 
 
 
c010ef4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c7f3351
c010ef4
 
7927694
c010ef4
86e7d18
 
c010ef4
2ffc7e7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
from num2words import num2words
import re


def number_form(number):
    """Return the index of the Ukrainian plural form that agrees with *number*.

    Args:
        number: The number written as a non-empty string of digits. Only its
            last one or two characters are inspected.

    Returns:
        0 when the number ends in 1 (except 11),
        1 when it ends in 2, 3 or 4 (except 12, 13, 14),
        2 in every other case, including the teens 11-14.

    Raises:
        IndexError: If *number* is empty.
    """
    last_digit = number[-1]  # deliberately raises IndexError on empty input

    # 11-14 take the genitive plural even though they end in 1-4.
    if number[-2:] in ("11", "12", "13", "14"):
        return 2
    if last_digit == "1":
        return 0
    if last_digit in ("2", "3", "4"):
        return 1
    return 2


# Currency code -> Ukrainian noun forms. The tuple is indexed by the
# plural-form index (0, 1 or 2) that replace_currency_with_words receives
# as ``num_form``.
CURRENCY = dict(
    USD=("долар", "долари", "доларів"),
    UAH=("гривня", "гривні", "гривень"),
    EUR=("євро", "євро", "євро"),
)


def replace_currency_with_words(text, currency, num_form):
    """Replace the sign of *currency* in *text* with its Ukrainian name.

    Args:
        text: Text that may contain the currency sign.
        currency: "USD", "UAH" or "EUR"; any other value leaves *text* unchanged.
        num_form: Index (0, 1 or 2) into the noun forms stored in CURRENCY.

    Returns:
        *text* with every occurrence of the matching sign replaced.
    """
    for code, sign in (("USD", "$"), ("UAH", "₴"), ("EUR", "€")):
        if currency == code:
            text = text.replace(sign, CURRENCY[currency][num_form])
    return text


def find_any_char(text: str, find: str, start: int):
    """Return the lowest index at or after *start* where any character of *find* occurs.

    Each character of *find* is searched for independently with ``str.find``;
    the smallest non-negative hit wins. Returns -1 when none of them occurs.
    """
    positions = [text.find(char, start) for char in find]
    hits = [position for position in positions if position >= 0]
    return min(hits) if hits else -1


# TODO: check whether https://github.com/lang-uk/tokenize-uk can be used instead.
def simple_tokenizer(text: str):
    """Yield the words of *text* alternating with the separators between them.

    A separator is a single space or comma. The stream always has the shape
    ``word, sep, word, sep, ..., word``; a word is the empty string when two
    separators are adjacent or when the text starts or ends with one, so
    ``"".join(simple_tokenizer(text)) == text``.

    Splitting with one regular-expression pass replaces a scan that searched
    the rest of the text for every separator after every token, which was
    quadratic for text lacking one of the separators.
    """
    # The capturing group makes re.split keep each separator as its own item.
    yield from re.split(r"([ ,])", text)


# (sign, currency code, grammatical gender passed to num2words), in the order
# the signs are looked for.
_CURRENCY_SIGNS = (
    ("$", "USD", "masculine"),
    ("₴", "UAH", "feminine"),
    ("€", "EUR", "masculine"),
)

# Apostrophe, ellipsis, quote and dash variants mapped to plain equivalents.
_CHAR_NORMALIZATION = {
    "`": "'",
    "ʼ": "'",
    "…": "...",
    "”": '"',
    "“": '"',
    "’": '"',
    "‘": '"',
    "«": '"',
    "»": '"',
    "–": "-",
    "—": "-",
    "―": "-",
}

# Digit-by-digit fallback for digits that were not converted as whole numbers.
_DIGIT_FALLBACK = {
    "1": "один ",
    "2": "два ",
    "3": "три ",
    "4": "чотири ",
    "5": "п'ять ",
    "6": "шість ",
    "7": "сім ",
    "8": "вісім ",
    "9": "дев'ять ",
    "0": "нуль ",
}

# Brute-force Latin -> Ukrainian transliteration. Insertion order matters:
# digraphs must be replaced before single letters, and "шч" (produced by
# "sh" + "ch") must be merged after those two entries.
_TRANSLITERATION = {
    "qu": "кв",
    "ch": "ч",
    "sh": "ш",
    "шч": "щ",  # after previous cases
    "ph": "ф",
    "kh": "х",
    "yo": "йо",
    "yu": "ю",
    "ya": "я",
    "ye": "є",
    "yi": "ї",
    "zh": "ж",
    "ts": "ц",
    "th": "т",
    "a": "а",
    "b": "б",
    "c": "ц",
    "d": "д",
    "e": "е",
    "f": "ф",
    "g": "ґ",
    "h": "г",
    "i": "і",
    "j": "дж",
    "k": "к",
    "l": "л",
    "m": "м",
    "n": "н",
    "o": "о",
    "p": "п",
    "q": "кв",
    "r": "р",
    "s": "с",
    "t": "т",
    "u": "ю",
    "v": "в",
    "w": "в",
    "x": "кс",
    "y": "і",
    "z": "з",
}


def preprocess_text(text):
    """Normalise *text* into lowercase words: numbers, currency signs and Latin
    letters are spelled out in Ukrainian.

    Processing order:
      1. lowercase the text;
      2. pick one currency (first of ``$``, ``₴``, ``€`` present) and the
         grammatical gender used when spelling numbers;
      3. normalise apostrophes, ellipsis, quotes and dashes;
      4. join digit groups separated by whitespace ("1 000" -> "1000");
      5. convert numeric tokens with ``num2words`` and remember the plural
         form of the last number converted;
      6. replace the currency sign with the currency name in that form;
      7. spell any remaining digits one by one;
      8. transliterate Latin letters.

    Args:
        text: Raw input string.

    Returns:
        The normalised string.
    """
    text = text.lower()

    # Only one currency is handled per call; "masculine" is the default gender.
    currency, gender = "", "masculine"
    for sign, code, sign_gender in _CURRENCY_SIGNS:
        if sign in text:
            currency, gender = code, sign_gender
            break

    # Plural-form index of the most recently converted number; a single value
    # is applied to every currency sign in the text.
    num_form = 0

    for symbol, value in _CHAR_NORMALIZATION.items():
        text = text.replace(symbol, value)

    # numbers
    text = re.sub(r"(\d)\s+(\d)", r"\1\2", text)

    numbers = "0123456789"
    splits = ",."
    currencies = "$₴€"

    def detect_num_and_convert(word):
        """Spell out the numeric parts of one token; return other parts unchanged."""
        nonlocal num_form
        result = []
        for part in word.split("-"):  # for handling complex words
            if not part:
                # Empty pieces (adjacent separators, leading "-") carry nothing
                # to convert; keep them so the token can be re-joined exactly.
                result.append(part)
                continue

            has_digit = any(char in numbers for char in part)
            is_number = all(char in numbers for char in part) or (
                has_digit and any(char in splits for char in part)
            )
            # contains both number and currency symbol
            is_currency = has_digit and any(char in currencies for char in part)
            if not (is_number or is_currency):
                result.append(part)
                continue

            try:
                already_converted = False
                if is_currency:
                    cleaned_part = part

                    # Detach every currency sign into its own space-separated
                    # word; a leading sign is moved behind the amount.
                    for part_currency in currencies:
                        if cleaned_part[0] == part_currency:
                            cleaned_part = cleaned_part[1:] + " " + part_currency
                        else:
                            cleaned_part = cleaned_part.replace(
                                part_currency, f" {part_currency} "
                            ).strip()  # TODO: replace with regex

                    part = " ".join(
                        detect_num_and_convert(part_word)
                        for part_word in cleaned_part.split(" ")
                    )
                    already_converted = True

                ends_with_dot = part.endswith(".")
                ends_with_comma = part.endswith(",")
                if ends_with_comma or ends_with_dot:
                    # Convert the text before the trailing punctuation mark,
                    # then put the mark back.
                    part = part[:-1]
                    part = " ".join(
                        detect_num_and_convert(part_word)
                        for part_word in part.split(" ")
                    ) + ("." if ends_with_dot else ",")
                    already_converted = True

                if already_converted:
                    # The recursive calls already spelled out the digits and
                    # recorded num_form. Deriving the form from the converted
                    # text would inspect a letter, "." or currency sign and
                    # always select the "many" form.
                    result.append(part)
                else:
                    words = num2words(part.strip(), lang="uk", gender=gender)
                    # Record the plural form only for tokens that really were
                    # numbers, i.e. after num2words accepted them.
                    num_form = number_form(part)
                    result.append(words)
            except Exception:
                # Best effort: anything that cannot be converted is kept as is
                # and handled by the digit fallback below.
                result.append(part)
        return "-".join(result)

    text = "".join(detect_num_and_convert(word) for word in simple_tokenizer(text))
    text = replace_currency_with_words(text, currency, num_form)

    # fallback numbers
    for digit, spoken in _DIGIT_FALLBACK.items():
        text = text.replace(digit, spoken)

    # speak english alphabet using brute force transliteration
    # (the text is lowercase at this point, so only lowercase keys can match)
    for english_char, english_value in _TRANSLITERATION.items():
        text = text.replace(english_char, english_value)

    return text