Spaces:

robinhad
/

ukrainian-tts

Running

App Files Files Community

Yurii Paniv committed on Aug 16, 2023

Commit

d663a5a

•

1 Parent(s): 953871c

Fix formatter

Browse files

Files changed (2) hide show

tests/test_formatter.py +12 -3
ukrainian_tts/formatter.py +28 -10

tests/test_formatter.py CHANGED Viewed

@@ -19,9 +19,18 @@ import pytest
             "11100000001 доларів державного боргу.",
             "одинадцять мільярдів сто мільйонів один доларів державного боргу.",
         ),
-        ("10000$, 15000 корупціонерів", "десять тисяч доларів , п'ятнадцять тисяч корупціонерів"), # TODO: fix space before comma
-        ("$10000, 15000 корупціонерів", "доларів десять тисяч, п'ятнадцять тисяч корупціонерів"), # fix order
-        ("10000$ у еквіваленті борщових заправок", "десять тисяч доларів у еквіваленті борщових заправок"),
         # This case is wrong: the output should be "це дев'ятнадцятирічне вино."
         # Fixing it requires properly parsing words into a token stream,
         # which in turn requires reworking the current approach.

             "11100000001 доларів державного боргу.",
             "одинадцять мільярдів сто мільйонів один доларів державного боргу.",
         ),
+        (
+            "10000$, 15000 корупціонерів",
+            "десять тисяч доларів , п'ятнадцять тисяч корупціонерів",
+        ),  # TODO: fix space before comma
+        (
+            "$10000, 15000 корупціонерів",
+            "доларів десять тисяч, п'ятнадцять тисяч корупціонерів",
+        ),  # fix order
+        (
+            "10000$ у еквіваленті борщових заправок",
+            "десять тисяч доларів у еквіваленті борщових заправок",
+        ),
         # This case is wrong: the output should be "це дев'ятнадцятирічне вино."
         # Fixing it requires properly parsing words into a token stream,
         # which in turn requires reworking the current approach.

ukrainian_tts/formatter.py CHANGED Viewed

@@ -29,6 +29,7 @@ def replace_currency_with_words(text, currency, num_form):
         text = text.replace("€", CURRENCY[currency][num_form])
     return text
 def preprocess_text(text):
     text = text.lower()
     # currencies
@@ -75,23 +76,40 @@ def preprocess_text(text):
         nonlocal num_form
         parts = word.split("-")  # for handling complex words
         for part in parts:
-            is_number = all(map(lambda x: x in numbers, part)) or (any(map(lambda x: x in numbers, part)) and any(map(lambda x: x in splits, part)))
-            is_currency = any(map(lambda x: x in currencies, part)) and any(map(lambda x: x in numbers, part)) # contains both number and currency symbol
             if is_number or is_currency:
                 try:
                     if is_currency:
                         cleaned_part = part
-                        for part_currency in currencies:
-                            cleaned_part = cleaned_part.replace(part_currency, f" {part_currency} ").strip() # TODO: replace with regex
-                        part = " ".join([detect_num_and_convert(part_word) for part_word in cleaned_part.split(" ")])
-                    ends_with_dot = part.endswith(".") # ugly
                     ends_with_comma = part.endswith(",")
                     if ends_with_comma or ends_with_dot:
                         part = part[:-1]
-                        part = " ".join([detect_num_and_convert(part_word) for part_word in part.split(" ")]) + ("." if ends_with_dot else ",")
                     num_form = number_form(part)
                     result.append(num2words(part.strip(), lang="uk", gender=gender))
@@ -122,7 +140,7 @@ def preprocess_text(text):
         "qu": "кв",
         "ch": "ч",
         "sh": "ш",
-        "шч": "щ", # after previous cases
         "ph": "ф",
         "kh": "х",
         "yo": "йо",

         text = text.replace("€", CURRENCY[currency][num_form])
     return text
 def preprocess_text(text):
     text = text.lower()
     # currencies
         nonlocal num_form
         parts = word.split("-")  # for handling complex words
         for part in parts:
+            is_number = all(map(lambda x: x in numbers, part)) or (
+                any(map(lambda x: x in numbers, part))
+                and any(map(lambda x: x in splits, part))
+            )
+            is_currency = any(map(lambda x: x in currencies, part)) and any(
+                map(lambda x: x in numbers, part)
+            )  # contains both number and currency symbol
             if is_number or is_currency:
                 try:
                     if is_currency:
                         cleaned_part = part
+                        for part_currency in currencies:
+                            cleaned_part = cleaned_part.replace(
+                                part_currency, f" {part_currency} "
+                            ).strip()  # TODO: replace with regex
+                        part = " ".join(
+                            [
+                                detect_num_and_convert(part_word)
+                                for part_word in cleaned_part.split(" ")
+                            ]
+                        )
+                    ends_with_dot = part.endswith(".")  # ugly
                     ends_with_comma = part.endswith(",")
                     if ends_with_comma or ends_with_dot:
                         part = part[:-1]
+                        part = " ".join(
+                            [
+                                detect_num_and_convert(part_word)
+                                for part_word in part.split(" ")
+                            ]
+                        ) + ("." if ends_with_dot else ",")
                     num_form = number_form(part)
                     result.append(num2words(part.strip(), lang="uk", gender=gender))
         "qu": "кв",
         "ch": "ч",
         "sh": "ш",
+        "шч": "щ",  # after previous cases
         "ph": "ф",
         "kh": "х",
         "yo": "йо",