p committed on
Commit
ba0fb36
1 Parent(s): 7b6aa43

enable some langs supported by num2words

Browse files
Files changed (2) hide show
  1. app.py +9 -5
  2. num2words_lang_map.json +29 -0
app.py CHANGED
@@ -47,8 +47,12 @@ lang_codes = {key + " (" + lang_codes[key] + ")": lang_codes[key] for key in lan
47
  # Extract language names
48
  language_names = list(lang_codes.keys())
49
 
 
 
 
50
 
51
- def convert_eng_numbers_to_words(text):
 
52
  # Find all numbers in the text using regex
53
  numbers = re.findall(r"\d+", text)
54
  # Sort numbers in descending order of length
@@ -57,7 +61,7 @@ def convert_eng_numbers_to_words(text):
57
 
58
  # Replace numbers with their word equivalents
59
  for number in sorted_numbers:
60
- number_word = num2words(int(number))
61
  text = text.replace(number, number_word)
62
 
63
  return text
@@ -82,9 +86,9 @@ def prepare_sentences(text, lang="mya"):
82
  text = convert_mya_numbers_to_words(text)
83
  text = text.replace("\u104A", ",").replace("\u104B", ".")
84
 
85
- if lang.lower() == "eng":
86
- text = convert_eng_numbers_to_words(text)
87
-
88
  print("Processed text", text)
89
 
90
  paragraphs = [paragraph for paragraph in text.split("\n") if paragraph.strip()]
 
47
  # Extract language names
48
  language_names = list(lang_codes.keys())
49
 
50
+ # Load num2words_lang_map
51
+ with open("num2words_lang_map.json") as f:
52
+ num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict)
53
 
54
+
55
+ def convert_numbers_to_words_num2words(text, lang):
56
  # Find all numbers in the text using regex
57
  numbers = re.findall(r"\d+", text)
58
  # Sort numbers in descending order of length
 
61
 
62
  # Replace numbers with their word equivalents
63
  for number in sorted_numbers:
64
+ number_word = num2words(int(number), lang=num2words_lang_map[lang][0])
65
  text = text.replace(number, number_word)
66
 
67
  return text
 
86
  text = convert_mya_numbers_to_words(text)
87
  text = text.replace("\u104A", ",").replace("\u104B", ".")
88
 
89
+ if lang in num2words_lang_map:
90
+ print("num2words supports this lang", lang)
91
+ text = convert_numbers_to_words_num2words(text, lang)
92
  print("Processed text", text)
93
 
94
  paragraphs = [paragraph for paragraph in text.split("\n") if paragraph.strip()]
num2words_lang_map.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eng": ["en", "English, default"],
3
+ "amh": ["am", "Amharic"],
4
+ "ara": ["ar", "Arabic"],
5
+ "deu": ["de", "German"],
6
+ "spa": ["es", "Spanish"],
7
+ "fas": ["fa", "Farsi"],
8
+ "fin": ["fi", "Finnish"],
9
+ "fra": ["fr", "French"],
10
+ "heb": ["he", "Hebrew"],
11
+ "hun": ["hu", "Hungarian"],
12
+ "ind": ["id", "Indonesian"],
13
+ "isl": ["is", "Icelandic"],
14
+ "kan": ["kn", "Kannada"],
15
+ "kor": ["ko", "Korean"],
16
+ "kaz": ["kz", "Kazakh"],
17
+ "lav": ["lv", "Latvian"],
18
+ "pol": ["pl", "Polish"],
19
+ "swe": ["sv", "Swedish"],
20
+ "ron": ["ro", "Romanian"],
21
+ "rus": ["ru", "Russian"],
22
+ "tel": ["te", "Telugu"],
23
+ "tgk": ["tg", "Tajik"],
24
+ "tur": ["tr", "Turkish"],
25
+ "tha": ["th", "Thai"],
26
+ "vie": ["vi", "Vietnamese"],
27
+ "nld": ["nl", "Dutch"],
28
+ "ukr": ["uk", "Ukrainian"]
29
+ }