Spaces:

robinhad
/

ukrainian-tts

Running

App Files Files Community

Yurii Paniv committed on Aug 16, 2023

Commit

953871c

•

1 Parent(s): 85f14d6

Add cases for currencies

Browse files

Files changed (2) hide show

tests/test_formatter.py +4 -1
ukrainian_tts/formatter.py +42 -12

tests/test_formatter.py CHANGED Viewed

@@ -19,7 +19,10 @@ import pytest
             "11100000001 доларів державного боргу.",
             "одинадцять мільярдів сто мільйонів один доларів державного боргу.",
         ),
-        # this is wrong case, should be "це дев'ятнадцяти-річне вино."
         # Implementing this requires proper parsing of words into the token stream,
         # which requires reworking the current approach.
         ("це 19-річне вино.", "це дев'ятнадцять-річне вино."),

             "11100000001 доларів державного боргу.",
             "одинадцять мільярдів сто мільйонів один доларів державного боргу.",
         ),
+        ("10000$, 15000 корупціонерів", "десять тисяч доларів , п'ятнадцять тисяч корупціонерів"), # TODO: fix space before comma
+        ("$10000, 15000 корупціонерів", "доларів десять тисяч, п'ятнадцять тисяч корупціонерів"), # fix order
+        ("10000$ у еквіваленті борщових заправок", "десять тисяч доларів у еквіваленті борщових заправок"),
+        # this is wrong case, should be "це дев'ятнадцятирічне вино."
         # Implementing this requires proper parsing of words into the token stream,
         # which requires reworking the current approach.
         ("це 19-річне вино.", "це дев'ятнадцять-річне вино."),

ukrainian_tts/formatter.py CHANGED Viewed

@@ -18,6 +18,17 @@ CURRENCY = {
 }
 def preprocess_text(text):
     text = text.lower()
     # currencies
@@ -57,17 +68,33 @@ def preprocess_text(text):
     text = re.sub(r"(\d)\s+(\d)", r"\1\2", text)
     def detect_num_and_convert(word):
-        numbers = "0123456789,."
         result = []
         nonlocal num_form
         parts = word.split("-")  # for handling complex words
         for part in parts:
-            is_number = all(map(lambda x: x in numbers, part))
-            if is_number:
                 try:
                     num_form = number_form(part)
-                    print("-" + part + "-" + str(num_form))
-                    result.append(num2words(part, lang="uk", gender=gender))
                 except:
                     result.append(part)
             else:
@@ -76,14 +103,8 @@ def preprocess_text(text):
     # print([detect_num_and_convert(word) for word in text.split(" ")])
     text = " ".join([detect_num_and_convert(word) for word in text.split(" ")])
-    if currency == "USD":
-        text = text.replace("$", CURRENCY[currency][num_form])
-    if currency == "UAH":
-        text = text.replace("₴", CURRENCY[currency][num_form])
-    if currency == "EUR":
-        text = text.replace("€", CURRENCY[currency][num_form])
     # fallback numbers
     text = text.replace("1", "один ")
@@ -101,8 +122,17 @@ def preprocess_text(text):
         "qu": "кв",
         "ch": "ч",
         "sh": "ш",
         "ph": "ф",
         "kh": "х",
         "a": "а",
         "b": "б",
         "c": "ц",

 }
+def replace_currency_with_words(text, currency, num_form):
+    if currency == "USD":
+        text = text.replace("$", CURRENCY[currency][num_form])
+    if currency == "UAH":
+        text = text.replace("₴", CURRENCY[currency][num_form])
+    if currency == "EUR":
+        text = text.replace("€", CURRENCY[currency][num_form])
+    return text
 def preprocess_text(text):
     text = text.lower()
     # currencies
     text = re.sub(r"(\d)\s+(\d)", r"\1\2", text)
     def detect_num_and_convert(word):
+        numbers = "0123456789"
+        splits = ",."
+        currencies = "$₴€"
         result = []
         nonlocal num_form
         parts = word.split("-")  # for handling complex words
         for part in parts:
+            is_number = all(map(lambda x: x in numbers, part)) or (any(map(lambda x: x in numbers, part)) and any(map(lambda x: x in splits, part)))
+            is_currency = any(map(lambda x: x in currencies, part)) and any(map(lambda x: x in numbers, part)) # contains both number and currency symbol
+            if is_number or is_currency:
                 try:
+                    if is_currency:
+                        cleaned_part = part
+                        for part_currency in currencies:
+                            cleaned_part = cleaned_part.replace(part_currency, f" {part_currency} ").strip() # TODO: replace with regex
+                        part = " ".join([detect_num_and_convert(part_word) for part_word in cleaned_part.split(" ")])
+                    ends_with_dot = part.endswith(".") # ugly
+                    ends_with_comma = part.endswith(",")
+                    if ends_with_comma or ends_with_dot:
+                        part = part[:-1]
+                        part = " ".join([detect_num_and_convert(part_word) for part_word in part.split(" ")]) + ("." if ends_with_dot else ",")
                     num_form = number_form(part)
+                    result.append(num2words(part.strip(), lang="uk", gender=gender))
                 except:
                     result.append(part)
             else:
     # print([detect_num_and_convert(word) for word in text.split(" ")])
     text = " ".join([detect_num_and_convert(word) for word in text.split(" ")])
+    text = replace_currency_with_words(text, currency, num_form)
     # fallback numbers
     text = text.replace("1", "один ")
         "qu": "кв",
         "ch": "ч",
         "sh": "ш",
+        "шч": "щ", # after previous cases
         "ph": "ф",
         "kh": "х",
+        "yo": "йо",
+        "yu": "ю",
+        "ya": "я",
+        "ye": "є",
+        "yi": "ї",
+        "zh": "ж",
+        "ts": "ц",
+        "th": "т",
         "a": "а",
         "b": "б",
         "c": "ц",