Spaces:

robinhad
/

ukrainian-tts

Running

File size: 3,549 Bytes

349b2ad
 
9caae98
 
349b2ad
77f184e
9caae98
349b2ad
 
 
 
c7de0f6
 
 
 
77f184e
349b2ad
 
 
e5a3778
77f184e
e5a3778
 
 
 
 
 
 
 
 
 
349b2ad
77f184e
349b2ad
 
 
 
 
 
 
 
 
 
 
 
 
c010ef4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c7f3351
c010ef4
 
 
 
77f184e
c010ef4
 
12ee3cc
 
 
349b2ad
c7de0f6
c010ef4
 
 
e5a3778
77f184e
 
 
 
 
 
 
 
 
 
 
e5a3778
77f184e

import num2words
import re
from stress import sentence_to_stress, stress_dict
from stress_with_model import stress_with_model


# Single-character substitutions applied before any number handling:
# currency signs become Ukrainian words and apostrophe look-alikes become "'".
_SYMBOL_TABLE = str.maketrans(
    {
        "$": "долар",
        "₴": "гривня",
        "€": "євро",
        "`": "'",
        "ʼ": "'",
    }
)

# Two digits separated only by whitespace (e.g. the groups of "111 000 000").
_DIGIT_GAP = re.compile(r"(\d)\s+(\d)")

# Characters a token part may consist of to be treated as a number.
_NUMBER_CHARS = frozenset("0123456789,.")
_NUMBER_PUNCTUATION = ",."

# Digit-by-digit reading for any digit that num2words did not convert.
_FALLBACK_DIGIT_TABLE = str.maketrans(
    {
        "1": "один ",
        "2": "два ",
        "3": "три ",
        "4": "чотири ",
        "5": "п'ять ",
        "6": "шість ",
        "7": "сім ",
        "8": "вісім ",
        "9": "дев'ять ",
        "0": "нуль ",
    }
)

# Brute-force, letter-by-letter transliteration of the English alphabet.
_LATIN_TO_CYRILLIC = {
    "a": "а",
    "b": "б",
    "c": "ц",
    "d": "д",
    "e": "е",
    "f": "ф",
    "g": "ґ",
    "h": "г",
    "i": "і",
    "j": "дж",
    "k": "к",
    "l": "л",
    "m": "м",
    "n": "н",
    "o": "о",
    "p": "п",
    "q": "кв",
    "r": "р",
    "s": "с",
    "t": "т",
    "u": "ю",
    "v": "в",
    "w": "в",
    "x": "кс",
    "y": "і",
    "z": "з",
}
# An uppercase Latin letter maps to the fully uppercased replacement
# (so "Q" becomes "КВ"), matching the per-letter str.upper() behaviour.
_TRANSLITERATION_TABLE = str.maketrans(
    {
        **{
            latin.upper(): cyrillic.upper()
            for latin, cyrillic in _LATIN_TO_CYRILLIC.items()
        },
        **_LATIN_TO_CYRILLIC,
    }
)


def _spell_out_number(part):
    """Return *part* spelled out in Ukrainian if it is a number, else unchanged.

    Trailing "." / "," (sentence punctuation glued to the number) is split
    off before conversion and re-attached afterwards so it is neither lost
    nor the reason the conversion fails.
    """
    core = part.rstrip(_NUMBER_PUNCTUATION)
    # An empty core means the part was empty or punctuation only; there is
    # nothing to convert, so do not bother num2words with it.
    if not core or not _NUMBER_CHARS.issuperset(core):
        return part
    try:
        spelled = num2words.num2words(core, lang="uk")
    except Exception:
        # Deliberate best effort: anything num2words cannot parse
        # (e.g. "1.2.3") is left for the digit-by-digit fallback.
        return part
    return spelled + part[len(core):]


def _spell_out_numbers_in_word(word):
    """Convert every numeric "-"-separated part of *word* (e.g. "19-річне")."""
    return "-".join(_spell_out_number(part) for part in word.split("-"))


def preprocess_text(text: str, use_autostress_model: bool = False):
    """Normalize raw text into a form suitable for stress marking.

    Steps, in order:
      1. Replace currency signs with words and unify apostrophe variants.
      2. Join digits separated by whitespace ("20 000" -> "20000").
      3. Spell out numbers with num2words (Ukrainian), handling hyphenated
         compounds part by part.
      4. Read any remaining digit individually as a fallback.
      5. Transliterate Latin letters to Cyrillic one letter at a time.
      6. Pass the result through ``sentence_to_stress``.

    Args:
        text: The input text.
        use_autostress_model: If true, ``stress_with_model`` is handed to
            ``sentence_to_stress``; otherwise ``stress_dict`` is.

    Returns:
        Whatever ``sentence_to_stress`` returns for the normalized text.
    """
    text = text.translate(_SYMBOL_TABLE)
    text = _DIGIT_GAP.sub(r"\1\2", text)

    # Words are delimited by plain spaces only, so the original spacing is
    # reproduced exactly when the words are joined back together.
    text = " ".join(_spell_out_numbers_in_word(word) for word in text.split(" "))

    text = text.translate(_FALLBACK_DIGIT_TABLE)
    text = text.translate(_TRANSLITERATION_TABLE)

    return sentence_to_stress(
        text, stress_with_model if use_autostress_model else stress_dict
    )


if __name__ == "__main__":
    # Smoke checks, run only when this file is executed as a script.

    def _expect(raw, spoken):
        """Assert that preprocessing *raw* yields exactly *spoken*."""
        assert preprocess_text(raw) == spoken

    # Latin letters are transliterated one by one.
    _expect("Quality of life update", "КВюаліті оф ліфе юпдате")
    # Currency sign plus a large number.
    _expect("Він украв 20000000 $", "Він украв двадцять мільйонів долар")
    # Digit groups separated by spaces are read as a single number.
    _expect(
        "111 000 000 000 доларів державного боргу.",
        "сто одинадцять мільярдів доларів державного боргу.",
    )
    _expect(
        "11100000001 доларів державного боргу.",
        "одинадцять мільярдів сто мільйонів одна доларів державного боргу.",
    )
    # Hyphenated compounds are converted part by part.
    _expect("це 19-річне вино.", "це дев'ятнадцять-річне вино.")
    _expect(
        "10-30-40-50-5-9-5",
        "десять-тридцять-сорок-п'ятдесят-п'ять-дев'ять-п'ять",
    )