Spaces:

robinhad
/

ukrainian-tts

Running

App Files Files Community

Andrii Kurdiumov Yurii Paniv committed on Aug 28, 2023

Commit

6180416

•

1 Parent(s): d663a5a

Use simple tokenizer (#34)

Browse files

* Fix couple cases

* Update formatter.py

---------

Co-authored-by: Yurii Paniv <mr.robinhad@gmail.com>

Files changed (2) hide show

tests/test_formatter.py +27 -7
ukrainian_tts/formatter.py +30 -6

tests/test_formatter.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from ukrainian_tts.formatter import preprocess_text
 import pytest
 @pytest.mark.parametrize(
     "text,expected",
     [
@@ -21,22 +20,43 @@ import pytest
         ),
         (
             "10000$, 15000 корупціонерів",
-            "десять тисяч доларів , п'ятнадцять тисяч корупціонерів",
-        ),  # TODO: fix space before comma
         (
             "$10000, 15000 корупціонерів",
-            "доларів десять тисяч, п'ятнадцять тисяч корупціонерів",
-        ),  # fix order
         (
             "10000$ у еквіваленті борщових заправок",
             "десять тисяч доларів у еквіваленті борщових заправок",
         ),
         # This is a wrong case; it should be "це дев'ятнадцятирічне вино."
         # Implementing this requires proper parsing of words into the token stream,
         # which requires reworking the current approach.
         ("це 19-річне вино.", "це дев'ятнадцять-річне вино."),
-        ("10-30-40-50-5-9-5", "десять-тридцять-сорок-п'ятдесят-п'ять-дев'ять-п'ять"),
     ],
 )
-def test_formatter(text, expected):
     assert preprocess_text(text) == expected

 from ukrainian_tts.formatter import preprocess_text
 import pytest
 @pytest.mark.parametrize(
     "text,expected",
     [
         ),
         (
             "10000$, 15000 корупціонерів",
+            "десять тисяч доларів, п'ятнадцять тисяч корупціонерів",
+        ),
+        (
+            "10000 $, 15000 корупціонерів",
+            "десять тисяч доларів, п'ятнадцять тисяч корупціонерів",
+        ),
         (
             "$10000, 15000 корупціонерів",
+            "десять тисяч доларів, п'ятнадцять тисяч корупціонерів",
+        ),
         (
             "10000$ у еквіваленті борщових заправок",
             "десять тисяч доларів у еквіваленті борщових заправок",
         ),
+        ("10-30-40-50-5-9-5", "десять-тридцять-сорок-п'ятдесят-п'ять-дев'ять-п'ять"),
+    ],
+)
+def test_formatter(text, expected):
+    assert preprocess_text(text) == expected
+# Purpose of these tests: to keep a clearly separate list of known issues
+# in the conversion. Once fixed, these cases should move to test_formatter.
+# We still want to make sure that no changes happen there, as any regression
+# is bad, or interesting.
+@pytest.mark.parametrize(
+    "text,expected",
+    [
+        # Should be два долара
+        (
+            "2 $, 15000 корупціонерів",
+            "два доларів, п'ятнадцять тисяч корупціонерів",
+        ),
         # This is a wrong case; it should be "це дев'ятнадцятирічне вино."
         # Implementing this requires proper parsing of words into the token stream,
         # which requires reworking the current approach.
         ("це 19-річне вино.", "це дев'ятнадцять-річне вино."),
     ],
 )
+def test_planned_formatter_issues(text, expected):
     assert preprocess_text(text) == expected

ukrainian_tts/formatter.py CHANGED Viewed

@@ -29,6 +29,28 @@ def replace_currency_with_words(text, currency, num_form):
         text = text.replace("€", CURRENCY[currency][num_form])
     return text
 def preprocess_text(text):
     text = text.lower()
@@ -89,9 +111,12 @@ def preprocess_text(text):
                         cleaned_part = part
                         for part_currency in currencies:
-                            cleaned_part = cleaned_part.replace(
-                                part_currency, f" {part_currency} "
-                            ).strip()  # TODO: replace with regex
                         part = " ".join(
                             [
@@ -119,9 +144,8 @@ def preprocess_text(text):
                 result.append(part)
         return "-".join(result)
-    # print([detect_num_and_convert(word) for word in text.split(" ")])
-    text = " ".join([detect_num_and_convert(word) for word in text.split(" ")])
     text = replace_currency_with_words(text, currency, num_form)
     # fallback numbers

         text = text.replace("€", CURRENCY[currency][num_form])
     return text
def find_any_char(text: str, find: str, start: int) -> int:
    """Return the lowest index at or after ``start`` holding any character of ``find``.

    Args:
        text: String to search.
        find: Candidate characters; each one is searched for individually.
        start: Index to begin searching from, with ``str.find`` semantics.

    Returns:
        The smallest matching index, or -1 when none of the characters occur
        (which includes the case of an empty ``find``).
    """
    result = -1
    for c in find:
        # Once a match is known, only an earlier one can replace it, so stop
        # the next search at that index instead of scanning the whole tail.
        end = len(text) if result == -1 else result
        index = text.find(c, start, end)
        if index >= 0:
            result = index
    return result
+# Have to check if I can use https://github.com/lang-uk/tokenize-uk
def simple_tokenizer(text: str):
    """Split ``text`` on spaces and commas, yielding the separators too.

    Words and single-character separators are yielded alternately, always
    starting and ending with a word. A word is the empty string when two
    separators are adjacent or when ``text`` begins or ends with a
    separator, so ``"".join(simple_tokenizer(text))`` reproduces ``text``.

    Args:
        text: String to tokenize.

    Yields:
        Each word followed by the separator that ended it, then the
        trailing word.
    """
    separators = " ,"
    start = 0
    # One linear pass: searching again from each token start rescans the
    # rest of the text whenever one of the separators is rare or absent.
    for index, char in enumerate(text):
        if char in separators:
            yield text[start:index]
            yield char
            start = index + 1
    yield text[start:]
 def preprocess_text(text):
     text = text.lower()
                         cleaned_part = part
                         for part_currency in currencies:
+                            if cleaned_part[0] == part_currency:
+                                cleaned_part = cleaned_part[1:] + " " + part_currency
+                            else:
+                                cleaned_part = cleaned_part.replace(
+                                    part_currency, f" {part_currency} "
+                                ).strip()  # TODO: replace with regex
                         part = " ".join(
                             [
                 result.append(part)
         return "-".join(result)
+    # print([detect_num_and_convert(word) for word in simple_tokenizer(text)])
+    text = "".join([detect_num_and_convert(word) for word in simple_tokenizer(text)])
     text = replace_currency_with_words(text, currency, num_form)
     # fallback numbers