Yurii Paniv committed on
Commit
953871c
1 Parent(s): 85f14d6

Add cases for currencies

Browse files
tests/test_formatter.py CHANGED
@@ -19,7 +19,10 @@ import pytest
19
  "11100000001 доларів державного боргу.",
20
  "одинадцять мільярдів сто мільйонів один доларів державного боргу.",
21
  ),
22
- # this is wrong case, should be "це дев'ятнадцяти-річне вино."
 
 
 
23
  # Implementing this requires proper parsing of words into the token stream
24
  # which requires reworking the current approach.
25
  ("це 19-річне вино.", "це дев'ятнадцять-річне вино."),
19
  "11100000001 доларів державного боргу.",
20
  "одинадцять мільярдів сто мільйонів один доларів державного боргу.",
21
  ),
22
+ ("10000$, 15000 корупціонерів", "десять тисяч доларів , п'ятнадцять тисяч корупціонерів"), # TODO: fix space before comma
23
+ ("$10000, 15000 корупціонерів", "доларів десять тисяч, п'ятнадцять тисяч корупціонерів"), # fix order
24
+ ("10000$ у еквіваленті борщових заправок", "десять тисяч доларів у еквіваленті борщових заправок"),
25
+ # this is wrong case, should be "це дев'ятнадцятирічне вино."
26
  # Implementing this requires proper parsing of words into the token stream
27
  # which requires reworking the current approach.
28
  ("це 19-річне вино.", "це дев'ятнадцять-річне вино."),
ukrainian_tts/formatter.py CHANGED
@@ -18,6 +18,17 @@ CURRENCY = {
18
  }
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
21
  def preprocess_text(text):
22
  text = text.lower()
23
  # currencies
@@ -57,17 +68,33 @@ def preprocess_text(text):
57
  text = re.sub(r"(\d)\s+(\d)", r"\1\2", text)
58
 
59
  def detect_num_and_convert(word):
60
- numbers = "0123456789,."
 
 
61
  result = []
62
  nonlocal num_form
63
  parts = word.split("-") # for handling complex words
64
  for part in parts:
65
- is_number = all(map(lambda x: x in numbers, part))
66
- if is_number:
 
67
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  num_form = number_form(part)
69
- print("-" + part + "-" + str(num_form))
70
- result.append(num2words(part, lang="uk", gender=gender))
71
  except:
72
  result.append(part)
73
  else:
@@ -76,14 +103,8 @@ def preprocess_text(text):
76
 
77
  # print([detect_num_and_convert(word) for word in text.split(" ")])
78
  text = " ".join([detect_num_and_convert(word) for word in text.split(" ")])
79
- if currency == "USD":
80
- text = text.replace("$", CURRENCY[currency][num_form])
81
 
82
- if currency == "UAH":
83
- text = text.replace("₴", CURRENCY[currency][num_form])
84
-
85
- if currency == "EUR":
86
- text = text.replace("€", CURRENCY[currency][num_form])
87
 
88
  # fallback numbers
89
  text = text.replace("1", "один ")
@@ -101,8 +122,17 @@ def preprocess_text(text):
101
  "qu": "кв",
102
  "ch": "ч",
103
  "sh": "ш",
 
104
  "ph": "ф",
105
  "kh": "х",
 
 
 
 
 
 
 
 
106
  "a": "а",
107
  "b": "б",
108
  "c": "ц",
18
  }
19
 
20
 
21
def replace_currency_with_words(text, currency, num_form):
    """Replace the symbol of the given currency in ``text`` with its word form.

    Args:
        text: The string in which the currency symbol is substituted.
        currency: Currency code; "USD", "UAH" and "EUR" are handled here.
        num_form: Key used to pick the word form from ``CURRENCY[currency]``.

    Returns:
        ``text`` with every occurrence of the matching currency symbol
        replaced by ``CURRENCY[currency][num_form]``. For any other
        ``currency`` value the text is returned unchanged.
    """
    # Each supported currency code paired with the symbol it stands for.
    code_symbol_pairs = (
        ("USD", "$"),
        ("UAH", "₴"),
        ("EUR", "€"),
    )
    for code, symbol in code_symbol_pairs:
        if currency == code:
            text = text.replace(symbol, CURRENCY[currency][num_form])
    return text
31
+
32
  def preprocess_text(text):
33
  text = text.lower()
34
  # currencies
68
  text = re.sub(r"(\d)\s+(\d)", r"\1\2", text)
69
 
70
  def detect_num_and_convert(word):
71
+ numbers = "0123456789"
72
+ splits = ",."
73
+ currencies = "$₴€"
74
  result = []
75
  nonlocal num_form
76
  parts = word.split("-") # for handling complex words
77
  for part in parts:
78
+ is_number = all(map(lambda x: x in numbers, part)) or (any(map(lambda x: x in numbers, part)) and any(map(lambda x: x in splits, part)))
79
+ is_currency = any(map(lambda x: x in currencies, part)) and any(map(lambda x: x in numbers, part)) # contains both number and currency symbol
80
+ if is_number or is_currency:
81
  try:
82
+ if is_currency:
83
+ cleaned_part = part
84
+
85
+ for part_currency in currencies:
86
+ cleaned_part = cleaned_part.replace(part_currency, f" {part_currency} ").strip() # TODO: replace with regex
87
+
88
+ part = " ".join([detect_num_and_convert(part_word) for part_word in cleaned_part.split(" ")])
89
+
90
+ ends_with_dot = part.endswith(".") # ugly
91
+ ends_with_comma = part.endswith(",")
92
+ if ends_with_comma or ends_with_dot:
93
+ part = part[:-1]
94
+ part = " ".join([detect_num_and_convert(part_word) for part_word in part.split(" ")]) + ("." if ends_with_dot else ",")
95
+
96
  num_form = number_form(part)
97
+ result.append(num2words(part.strip(), lang="uk", gender=gender))
 
98
  except:
99
  result.append(part)
100
  else:
103
 
104
  # print([detect_num_and_convert(word) for word in text.split(" ")])
105
  text = " ".join([detect_num_and_convert(word) for word in text.split(" ")])
 
 
106
 
107
+ text = replace_currency_with_words(text, currency, num_form)
 
 
 
 
108
 
109
  # fallback numbers
110
  text = text.replace("1", "один ")
122
  "qu": "кв",
123
  "ch": "ч",
124
  "sh": "ш",
125
+ "шч": "щ", # after previous cases
126
  "ph": "ф",
127
  "kh": "х",
128
+ "yo": "йо",
129
+ "yu": "ю",
130
+ "ya": "я",
131
+ "ye": "є",
132
+ "yi": "ї",
133
+ "zh": "ж",
134
+ "ts": "ц",
135
+ "th": "т",
136
  "a": "а",
137
  "b": "б",
138
  "c": "ц",