Yurii Paniv
Fix formatter
17873e4
from num2words import num2words
import re
def number_form(number):
if number[-1] == "1":
return 0
elif number[-1] in ("2", "3", "4"):
return 1
else:
return 2
CURRENCY = {
"USD": ("долар", "долари", "доларів"),
"UAH": ("гривня", "гривні", "гривень"),
"EUR": ("євро", "євро", "євро"),
}
def replace_currency_with_words(text, currency, num_form):
if currency == "USD":
text = text.replace("$", CURRENCY[currency][num_form])
if currency == "UAH":
text = text.replace("₴", CURRENCY[currency][num_form])
if currency == "EUR":
text = text.replace("€", CURRENCY[currency][num_form])
return text
def find_any_char(text: str, find: str, start: int):
result = -1
for c in find:
index = text.find(c, start)
if (index >= 0) and (result > index or result == -1):
result = index
return result
# Have to check if I can use https://github.com/lang-uk/tokenize-uk
def simple_tokenizer(text: str):
start = 0
index = find_any_char(text, " ,", start)
while index >= 0:
word = text[start:index]
yield word
separator = text[index]
yield separator
start = index + 1
index = find_any_char(text, " ,", start)
yield text[start:]
def preprocess_text(text):
text = text.lower()
# currencies
if "$" in text:
currency = "USD"
gender = "masculine"
elif "₴" in text:
currency = "UAH"
gender = "feminine"
elif "€" in text:
currency = "EUR"
gender = "masculine"
else:
currency = ""
gender = "masculine"
num_form = 0
# replace apostrophe
text = text.replace("`", "'")
text = text.replace("ʼ", "'")
text = text.replace("…", "...")
symbols = {
"”": '"',
"“": '"',
"’": '"',
"‘": '"',
"«": '"',
"»": '"',
"–": "-",
"—": "-",
"―": "-",
}
for symbol, value in symbols.items():
text = text.replace(symbol, value)
# numbers
text = re.sub(r"(\d)\s+(\d)", r"\1\2", text)
def detect_num_and_convert(word):
numbers = "0123456789"
splits = ",."
currencies = "$₴€"
result = []
nonlocal num_form
parts = word.split("-") # for handling complex words
for part in parts:
is_number = all(map(lambda x: x in numbers, part)) or (
any(map(lambda x: x in numbers, part))
and any(map(lambda x: x in splits, part))
)
is_currency = any(map(lambda x: x in currencies, part)) and any(
map(lambda x: x in numbers, part)
) # contains both number and currency symbol
if is_number or is_currency:
try:
if is_currency:
cleaned_part = part
for part_currency in currencies:
if cleaned_part[0] == part_currency:
cleaned_part = cleaned_part[1:] + " " + part_currency
else:
cleaned_part = cleaned_part.replace(
part_currency, f" {part_currency} "
).strip() # TODO: replace with regex
part = " ".join(
[
detect_num_and_convert(part_word)
for part_word in cleaned_part.split(" ")
]
)
ends_with_dot = part.endswith(".") # ugly
ends_with_comma = part.endswith(",")
if ends_with_comma or ends_with_dot:
part = part[:-1]
part = " ".join(
[
detect_num_and_convert(part_word)
for part_word in part.split(" ")
]
) + ("." if ends_with_dot else ",")
num_form = number_form(part)
result.append(num2words(part.strip(), lang="uk", gender=gender))
except:
result.append(part)
else:
result.append(part)
return "-".join(result)
# print([detect_num_and_convert(word) for word in simple_tokenizer(text)])
text = "".join([detect_num_and_convert(word) for word in simple_tokenizer(text)])
text = replace_currency_with_words(text, currency, num_form)
# fallback numbers
text = text.replace("1", "один ")
text = text.replace("2", "два ")
text = text.replace("3", "три ")
text = text.replace("4", "чотири ")
text = text.replace("5", "п'ять ")
text = text.replace("6", "шість ")
text = text.replace("7", "сім ")
text = text.replace("8", "вісім ")
text = text.replace("9", "дев'ять ")
text = text.replace("0", "нуль ")
# speak english alphabet using brute force transliteration
english = {
"qu": "кв",
"ch": "ч",
"sh": "ш",
"шч": "щ", # after previous cases
"ph": "ф",
"kh": "х",
"yo": "йо",
"yu": "ю",
"ya": "я",
"ye": "є",
"yi": "ї",
"zh": "ж",
"ts": "ц",
"th": "т",
"a": "а",
"b": "б",
"c": "ц",
"d": "д",
"e": "е",
"f": "ф",
"g": "ґ",
"h": "г",
"i": "і",
"j": "дж",
"k": "к",
"l": "л",
"m": "м",
"n": "н",
"o": "о",
"p": "п",
"q": "кв",
"r": "р",
"s": "с",
"t": "т",
"u": "ю",
"v": "в",
"w": "в",
"x": "кс",
"y": "і",
"z": "з",
}
for english_char, english_value in english.items():
# uppercase
text = text.replace(english_char.upper(), english_value.upper())
text = text.replace(english_char, english_value)
return text