p committed on
Commit
c9574d9
•
0 Parent(s):

Text to speech for 1000+ languages

Browse files
Files changed (7) hide show
  1. .gitattributes +34 -0
  2. .vscode/settings.json +6 -0
  3. README.md +21 -0
  4. app.py +202 -0
  5. lang_code.json +1114 -0
  6. mm_num2word.py +142 -0
  7. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.vscode/settings.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "[python]": {
3
+ "editor.defaultFormatter": "ms-python.black-formatter"
4
+ },
5
+ "python.formatting.provider": "none"
6
+ }
README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Massively Multilingual Speech (MMS) - Text To Speech
3
+ emoji: 🌍
4
+ colorFrom: yellow
5
+ colorTo: gray
6
+ sdk: gradio
7
+ app_file: app.py
8
+ pinned: true
9
+ ---
10
+
11
+ ## Info
12
+ Text to speech for 1000+ languages, using the [fairseq](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) MMS TTS models and the [ttsmms](https://github.com/wannaphong/ttsmms) wrapper.
13
+
14
+ + The language ISO code list (`lang_code.json`) is adapted from
15
+ https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html
16
+
17
+ The dropdown list is quite long, so I have placed some of my friends' frequently used languages at the top. The other 1000+ languages are sorted alphabetically.
18
+
19
+ + `mm_num2word.py` is adapted from https://github.com/hpbyte/Myanmar_Number_to_Words
20
+
21
+ + For other dependencies, please refer to the `requirements.txt` file.
app.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Based on example code of https://huggingface.co/facebook/m2m100_1.2B
2
+ # and https://github.com/wannaphong/ttsmms
3
+ # See also https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md
4
+
5
+ import gradio as gr
6
+ import os
7
+ import re
8
+ import soundfile as sf
9
+
10
+ import json
11
+ import nltk
12
+ from underthesea import sent_tokenize as vie_sent_tokenize # Vietnamese NLP toolkit
13
+ from underthesea import text_normalize as vie_text_normalize
14
+ from nltk import sent_tokenize as nltk_sent_tokenize
15
+ from ttsmms import download
16
+ from ttsmms import TTS
17
+
18
+ from collections import OrderedDict
19
+ import uuid
20
+ import datetime
21
+ import shutil
22
+ from num2words import num2words
23
+
24
+
25
# Markdown text passed to the Gradio interface as its description.
this_description = """Text To Speech for [1000+ languages](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) - using [fairseq MMS TTS](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) and [ttsmms](https://github.com/wannaphong/ttsmms) wrapper.
Please note that for some languages, it may not pronounce all words correctly (yet).
"""

# Fetch the NLTK "punkt" resource at start-up.
nltk.download("punkt")

# Pre-download a few languages at start-up. Whatever download() returns
# for each one is kept both under its own name and in tts_models, keyed
# by language code.
tts_models = {}
eng_path = tts_models["eng"] = download("eng", "./data")
vie_path = tts_models["vie"] = download("vie", "./data")
mya_path = tts_models["mya"] = download("mya", "./data")
39
+
40
+ # Do some work in the user directory...
41
+
42
# Read the language-name -> language-code table; OrderedDict keeps the
# entries in the order they appear in lang_code.json.
with open("lang_code.json") as f:
    lang_codes = json.load(f, object_pairs_hook=OrderedDict)

# Re-key the table as "Name (code)" -> code, so each label also shows
# the code it maps to.
lang_codes = {
    name + " (" + code + ")": code for name, code in lang_codes.items()
}
# The labels, in table order.
language_names = list(lang_codes)
50
+
51
+
52
def convert_eng_numbers_to_words(text):
    """Replace every run of digits in *text* with its spelled-out form.

    Each maximal digit run matched by ``\\d+`` is converted with
    ``num2words(int(run))`` (num2words' default settings) and substituted
    in place. Everything that is not a digit run is left untouched, so a
    value such as ``1,000`` is treated as two separate runs.

    The substitution is done in a single left-to-right pass. Because each
    match is a maximal digit run, a short number can never be replaced
    inside a longer one, and the string is not rescanned once per number.

    Args:
        text: The input string.

    Returns:
        A new string with all digit runs replaced by words.
    """
    return re.sub(r"\d+", lambda match: num2words(int(match.group())), text)
65
+
66
+
67
def convert_mya_numbers_to_words(text):
    """Replace the numbers found in *text* with their ``mm_num2word`` form.

    The numbers are whatever ``mm_num2word.extract_num`` returns for the
    text. They are substituted longest first, so that a short number is
    not replaced inside a longer one before the longer one is handled.

    Args:
        text: The input string.

    Returns:
        The text after all substitutions.
    """
    # Imported here rather than at module level, as in the original code.
    from mm_num2word import extract_num, mm_num2word

    longest_first = sorted(extract_num(text), key=len, reverse=True)
    print(longest_first)

    for digits in longest_first:
        text = text.replace(digits, mm_num2word(digits))
    return text
77
+
78
+
79
def prepare_sentences(text, lang="mya"):
    """Pre-process *text* for language *lang* and split it into sentences.

    Steps:
      * ``mya``: U+104A and U+104B are replaced by ``,`` and ``.``, then
        numbers are spelled out with ``convert_mya_numbers_to_words``.
      * ``eng``: numbers are spelled out with
        ``convert_eng_numbers_to_words``.
      * The text is split on newlines; blank paragraphs are dropped.
      * ``vie``: each paragraph is tokenized with ``vie_sent_tokenize``
        and every non-blank sentence is passed through
        ``vie_text_normalize``.
      * Any other language: each paragraph is tokenized with
        ``nltk_sent_tokenize``; blank sentences are dropped.

    Args:
        text: The raw input text.
        lang: Language code; compared case-insensitively.

    Returns:
        A list of sentence strings, in document order.
    """
    code = lang.lower()

    # Language-specific clean-up before any splitting happens.
    if code == "mya":
        text = text.replace("\u104A", ",").replace("\u104B", ".")
        text = convert_mya_numbers_to_words(text)
    if code == "eng":
        text = convert_eng_numbers_to_words(text)

    print("Processed text", text)

    paragraphs = [chunk for chunk in text.split("\n") if chunk.strip()]

    if code != "vie":
        return [
            piece
            for chunk in paragraphs
            for piece in nltk_sent_tokenize(chunk)
            if piece.strip()
        ]

    # Vietnamese uses its own tokenizer plus text normalization.
    sentences = []
    for chunk in paragraphs:
        for piece in vie_sent_tokenize(chunk):
            if piece.strip():
                sentences.append(vie_text_normalize(piece))
    return sentences
112
+
113
+
114
def list_dir():
    """Debug helper: print the working directory and its ``.wav`` files.

    Prints the current working directory on the first line, then the name
    of every entry in it that ends with ``.wav``, one per line, in the
    order ``os.listdir`` reports them. Returns nothing.
    """
    here = os.getcwd()
    print(here)

    for entry in os.listdir(here):
        if entry.endswith(".wav"):
            print(entry)
128
+
129
+
130
def combine_wav(source_dir, stamp):
    """Concatenate every ``.wav`` file in *source_dir* into ``<stamp>.wav``.

    The files are joined in alphabetical order of their names. The result
    is written to ``f"{stamp}.wav"`` in the current working directory,
    using the sample rate reported for the last file read. *source_dir*
    is removed afterwards, whether or not combining succeeded, so that
    failed requests do not leave scratch directories behind.

    Args:
        source_dir: Directory holding the per-sentence WAV files.
        stamp: Base name (without extension) for the combined file.

    Returns:
        The path of the combined WAV file.

    Raises:
        ValueError: If *source_dir* contains no ``.wav`` files.
    """
    # Local import: the rest of the file does not import numpy directly.
    # NOTE(review): assumes numpy is installed alongside soundfile and that
    # sf.read returns NumPy arrays - confirm against requirements.txt.
    import numpy as np

    # Alphabetical order is the intended order of combination.
    wav_files = sorted(
        name for name in os.listdir(source_dir) if name.endswith(".wav")
    )

    try:
        if not wav_files:
            # Without this guard the sample rate below would be unbound.
            raise ValueError(
                f"No WAV files found in {source_dir!r}; nothing to combine"
            )

        chunks = []
        for name in wav_files:
            data, sr = sf.read(os.path.join(source_dir, name))
            chunks.append(data)

        # One array concatenation instead of copying sample by sample
        # into a Python list.
        combined_file_path = f"{stamp}.wav"
        sf.write(combined_file_path, np.concatenate(chunks), sr)
    finally:
        shutil.rmtree(source_dir)

    list_dir()

    # The caller hands this path to the Gradio audio output.
    return combined_file_path
154
+
155
+
156
def mms_tts(Input_Text, lang_name="Burmese (mya)"):
    """Synthesize *Input_Text* in the selected language and return a WAV path.

    The text is split into sentences with ``prepare_sentences``. Each
    sentence is synthesized into its own WAV file inside a per-request
    scratch directory, and ``combine_wav`` joins them into a single file.

    Args:
        Input_Text: The text to speak.
        lang_name: A key of ``lang_codes``, i.e. a ``"Name (code)"`` label.

    Returns:
        The path of the combined WAV file.

    Raises:
        KeyError: If *lang_name* is not a key of ``lang_codes``.
        gr.Error: If the input contains no sentences to synthesize.
    """
    lang_code = lang_codes[lang_name]

    # Reject empty input up front, before any download or directory
    # creation; otherwise combine_wav would be given an empty directory.
    sentences = prepare_sentences(Input_Text or "", lang_code)
    if not sentences:
        raise gr.Error("Please enter some text to convert to speech.")

    # Reuse models already fetched (at start-up or by an earlier request)
    # instead of calling download() on every request.
    user_model = tts_models.get(lang_code)
    if user_model is None:
        user_model = tts_models[lang_code] = download(lang_code, "./data")
    tts = TTS(user_model)

    # A timestamp alone can collide between simultaneous requests, so a
    # random suffix is always added. The same id names both the scratch
    # directory and the output file, so two requests cannot overwrite
    # each other's result.
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
    run_id = f"{timestamp}_{uuid.uuid4().hex}"
    user_dir = f"u_{run_id}"
    os.makedirs(user_dir)
    print("New user directory", user_dir)

    try:
        for i, sentence in enumerate(sentences):
            # Zero-padded index keeps alphabetical order == sentence order.
            tts.synthesis(sentence, wav_path=f"{user_dir}/s_{i:010d}.wav")
    except Exception:
        # Do not leave the scratch directory behind when synthesis fails.
        shutil.rmtree(user_dir, ignore_errors=True)
        raise

    return combine_wav(user_dir, run_id)
180
+
181
+
182
+ # common_languages = ["eng", "mya", "vie"] # List of common language codes
183
# Input widgets: a multi-line text box and a language picker whose
# choices come from language_names.
_text_box = gr.Textbox(
    lines=5,
    placeholder="Enter text to speech",
    label="Input text",
)
_language_picker = gr.Dropdown(
    choices=language_names,
    label="Select language 1,000+",
    value="Burmese (mya)",
)

# Wire the widgets to mms_tts; its return value feeds the audio output.
iface = gr.Interface(
    fn=mms_tts,
    title="Massively Multilingual Speech (MMS) - Text To Speech",
    description=this_description,
    inputs=[_text_box, _language_picker],
    outputs="audio",
)


# Launch the interface when the script runs.
iface.launch()
lang_code.json ADDED
@@ -0,0 +1,1114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Burmese": "mya",
3
+
4
+ "Mon": "mnw",
5
+ "Shan": "shn",
6
+
7
+ "English": "eng",
8
+ "Vietnamese": "vie",
9
+ "Thai": "tha",
10
+ "Thai, Northern": "nod",
11
+ "Indonesian": "ind",
12
+
13
+ "Khmer": "khm",
14
+ "Khmer, Northern": "kxm",
15
+
16
+ "Abidji": "abi",
17
+ "Aceh": "ace",
18
+ "Achagua": "aca",
19
+ "Achang": "acn",
20
+ "Achi": "acr",
21
+ "Acholi": "ach",
22
+ "Achuar-Shiwiar": "acu",
23
+ "Ach\u00e9": "guq",
24
+ "Adele": "ade",
25
+ "Adioukrou": "adj",
26
+ "Agarabi": "agd",
27
+ "Aghul": "agx",
28
+ "Agutaynen": "agn",
29
+ "Ahanta": "aha",
30
+ "Akan": "aka",
31
+ "Akateko": "knj",
32
+ "Akawaio": "ake",
33
+ "Akeu": "aeu",
34
+ "Akha": "ahk",
35
+ "Akoose": "bss",
36
+ "Alangan": "alj",
37
+ "Albanian": "sqi",
38
+ "Altai, Southern": "alt",
39
+ "Alune": "alp",
40
+ "Alur": "alz",
41
+ "Amazigh": "kab",
42
+ "Ambai": "amk",
43
+ "Ambrym, North": "mmg",
44
+ "Amharic": "amh",
45
+ "Amis": "ami",
46
+ "Amuzgo, San Pedro Amuzgos": "azg",
47
+ "Angor": "agg",
48
+ "Anjam": "boj",
49
+ "Anufo": "cko",
50
+ "Anyin": "any",
51
+ "Arabela": "arl",
52
+ "Arabic": "ara",
53
+ "Aralle-Tabulahan": "atq",
54
+ "Aringa": "luc",
55
+ "Armenian, Western": "hyw",
56
+ "Arop-Lokep": "apr",
57
+ "Arosi": "aia",
58
+ "Aruamu": "msy",
59
+ "Ash\u00e1ninka": "cni",
60
+ "Ash\u00e9ninka, Pajonal": "cjo",
61
+ "Ash\u00e9ninka, Pichis": "cpu",
62
+ "Ash\u00e9ninka, Ucayali-Yur\u00faa": "cpb",
63
+ "Assamese": "asm",
64
+ "Asu": "asa",
65
+ "Ateso": "teo",
66
+ "Atti\u00e9": "ati",
67
+ "Aukan": "djk",
68
+ "Avar": "ava",
69
+ "Avatime": "avn",
70
+ "Avokaya": "avu",
71
+ "Awa": "awb",
72
+ "Awa-Cuaiquer": "kwi",
73
+ "Awadhi": "awa",
74
+ "Awaj\u00fan": "agr",
75
+ "Awakateko": "agu",
76
+ "Aymara, Central": "ayr",
77
+ "Ayoreo": "ayo",
78
+ "Ayta, Abellen": "abp",
79
+ "Ayta, Mag-Indi": "blx",
80
+ "Ayta, Mag-antsi": "sgb",
81
+ "Azerbaijani, North": "azj-script_latin",
82
+ "Azerbaijani, South": "azb",
83
+ "Baatonum": "bba",
84
+ "Bada": "bhz",
85
+ "Baelelea": "bvc",
86
+ "Bagheli": "bfy",
87
+ "Bagri": "bgq",
88
+ "Bahnar": "bdq",
89
+ "Baka": "bdh",
90
+ "Bakhti\u00e2ri": "bqi",
91
+ "Bakw\u00e9": "bjw",
92
+ "Balantak": "blz",
93
+ "Bali": "ban",
94
+ "Balochi, Southern": "bcc-script_arabic",
95
+ "Bamanankan": "bam",
96
+ "Bambam": "ptu",
97
+ "Bana": "bcw",
98
+ "Bandial": "bqj",
99
+ "Bantoanon": "bno",
100
+ "Barai": "bbb",
101
+ "Bari": "bfa",
102
+ "Baruga": "bjz",
103
+ "Bashkort": "bak",
104
+ "Basque": "eus",
105
+ "Bassa": "bsq",
106
+ "Batak Angkola": "akb",
107
+ "Batak Dairi": "btd",
108
+ "Batak Karo": "btx",
109
+ "Batak Simalungun": "bts",
110
+ "Batak Toba": "bbc",
111
+ "Bauzi": "bvz",
112
+ "Bedjond": "bjv",
113
+ "Behoa": "bep",
114
+ "Bekwarra": "bkv",
115
+ "Belize English Creole": "bzj",
116
+ "Bemba": "bem",
117
+ "Benga": "bng",
118
+ "Bengali": "ben",
119
+ "Berom": "bom",
120
+ "Bete-Bendi": "btt",
121
+ "Bharia": "bha",
122
+ "Bhatri": "bgw",
123
+ "Bhattiyali": "bht",
124
+ "Biali": "beh",
125
+ "Bidayuh, Bau": "sne",
126
+ "Bikol, Buhi\u2019non": "ubl",
127
+ "Bikol, Central": "bcl",
128
+ "Bimoba": "bim",
129
+ "Binukid": "bkd",
130
+ "Binumarien": "bjr",
131
+ "Birifor, Malba": "bfo",
132
+ "Birifor, Southern": "biv",
133
+ "Bisa": "bib",
134
+ "Bislama": "bis",
135
+ "Bisu": "bzi",
136
+ "Bis\u00e3": "bqp",
137
+ "Blaan, Koronadal": "bpr",
138
+ "Blaan, Sarangani": "bps",
139
+ "Bobo Madar\u00e9, Southern": "bwq",
140
+ "Bodo Parja": "bdv",
141
+ "Boko": "bqc",
142
+ "Bokobaru": "bus",
143
+ "Bola": "bnp",
144
+ "Bomu": "bmq",
145
+ "Bonggi": "bdg",
146
+ "Bora": "boa",
147
+ "Borong": "ksr",
148
+ "Bor\u00f4ro": "bor",
149
+ "Bru, Eastern": "bru",
150
+ "Buamu": "box",
151
+ "Buang, Mapos": "bzh",
152
+ "Bughotu": "bgt",
153
+ "Buglere": "sab",
154
+ "Bulgarian": "bul",
155
+ "Buli": "bwu",
156
+ "Bum": "bmv",
157
+ "Bwanabwana": "tte",
158
+ "Cab\u00e9car": "cjp",
159
+ "Cacua": "cbv",
160
+ "Capanahua": "kaq",
161
+ "Caquinte": "cot",
162
+ "Carapana": "cbc",
163
+ "Carib": "car",
164
+ "Catalan": "cat",
165
+ "Cebuano": "ceb",
166
+ "Cerma": "cme",
167
+ "Chachi": "cbi",
168
+ "Chamacoco": "ceg",
169
+ "Chatino, Eastern Highland": "cly",
170
+ "Chatino, Nopala": "cya",
171
+ "Chechen": "che",
172
+ "Chhattisgarhi": "hne",
173
+ "Chichewa": "nya",
174
+ "Chidigo": "dig",
175
+ "Chiduruma": "dug",
176
+ "Chin, Bawm": "bgr",
177
+ "Chin, Eastern Khumi": "cek",
178
+ "Chin, Falam": "cfm",
179
+ "Chin, Hakha": "cnh",
180
+ "Chin, Matu": "hlt",
181
+ "Chin, M\u00fc\u00fcn": "mwq",
182
+ "Chin, Tedim": "ctd",
183
+ "Chin, Thado": "tcz",
184
+ "Chin, Zyphe": "zyp",
185
+ "Chinantec, Comaltepec": "cco",
186
+ "Chinantec, Lalana": "cnl",
187
+ "Chinantec, Lealao": "cle",
188
+ "Chinantec, Ozumac\u00edn": "chz",
189
+ "Chinantec, Palantla": "cpa",
190
+ "Chinantec, Sochiapam": "cso",
191
+ "Chinantec, Tepetotutla": "cnt",
192
+ "Chinantec, Usila": "cuc",
193
+ "Chinese, Hakka": "hak",
194
+ "Chinese, Min Nan": "nan",
195
+ "Chingoni": "xnj",
196
+ "Chipaya": "cap",
197
+ "Chiquitano": "cax",
198
+ "Chittagonian": "ctg",
199
+ "Chol": "ctu",
200
+ "Chontal, Tabasco": "chf",
201
+ "Chopi": "cce",
202
+ "Chorote, Iyojwa\u2019ja": "crt",
203
+ "Chorote, Iyo\u2019wujwa": "crq",
204
+ "Chuj": "cac-dialect_sanmateoixtat\u00e1n",
205
+ "Chukchi": "ckt",
206
+ "Chumburung": "ncu",
207
+ "Churahi": "cdj",
208
+ "Chuvash": "chv",
209
+ "Ch\u2019orti\u2019": "caa",
210
+ "Cishingini": "asg",
211
+ "Cof\u00e1n": "con",
212
+ "Cora, El Nayar": "crn",
213
+ "Cora, Santa Teresa": "cok",
214
+ "Cree, Plains": "crk-script_syllabics",
215
+ "Crimean Tatar": "crh",
216
+ "Cuiba": "cui",
217
+ "Daasanach": "dsh",
218
+ "Daba": "dbq",
219
+ "Dagaare, Southern": "dga",
220
+ "Dagara, Northern": "dgi",
221
+ "Dagba": "dgk",
222
+ "Dan": "dnj-dialect_blowowest",
223
+ "Dangal\u00e9at": "daa",
224
+ "Dani, Mid Grand Valley": "dnt",
225
+ "Dani, Western": "dnw",
226
+ "Dargwa": "dar",
227
+ "Datooga": "tcc",
228
+ "Dawro": "dwr",
229
+ "Dedua": "ded",
230
+ "Deg": "mzw",
231
+ "Delo": "ntr",
232
+ "Dendi": "ddn",
233
+ "Desano": "des",
234
+ "Desiya": "dso",
235
+ "Dhao": "nfa",
236
+ "Dhimal": "dhi",
237
+ "Dida, Yocobou\u00e9": "gud",
238
+ "Didinga": "did",
239
+ "Digaro-Mishmi": "mhu",
240
+ "Dinka, Northeastern": "dip",
241
+ "Dinka, Southwestern": "dik",
242
+ "Ditammari": "tbz",
243
+ "Dogon, Toro So": "dts",
244
+ "Dogos\u00e9": "dos",
245
+ "Dogri": "dgo",
246
+ "Duri": "mvp",
247
+ "Dutch": "nld",
248
+ "Dza": "jen",
249
+ "Dzongkha": "dzo",
250
+ "Ede Idaca": "idd",
251
+ "Ekajuk": "eka",
252
+ "Embera Cat\u00edo": "cto",
253
+ "Ember\u00e1, Northern": "emp",
254
+ "Enxet": "enx",
255
+ "Epena": "sja",
256
+ "Erzya": "myv",
257
+ "Ese": "mcq",
258
+ "Ese Ejja": "ese",
259
+ "Evenki": "evn",
260
+ "Ezaa": "eza",
261
+ "Fali, South": "fal",
262
+ "Faroese": "fao",
263
+ "Fataleka": "far",
264
+ "Fijian": "fij",
265
+ "Finnish": "fin",
266
+ "Fon": "fon",
267
+ "Fordata": "frd",
268
+ "French": "fra",
269
+ "Fulah": "ful",
270
+ "Fuliiru": "flr",
271
+ "Gadaba, Mudhili": "gau",
272
+ "Gaddi": "gbk",
273
+ "Gagauz": "gag-script_latin",
274
+ "Galela": "gbi",
275
+ "Gamo": "gmv",
276
+ "Ganda": "lug",
277
+ "Gapapaiwa": "pwg",
278
+ "Garhwali": "gbm",
279
+ "Garifuna": "cab",
280
+ "Garo": "grt",
281
+ "Gbaya": "krs",
282
+ "Gbaya, Southwest": "gso",
283
+ "Gela": "nlg",
284
+ "Gen": "gej",
285
+ "German, Standard": "deu",
286
+ "Ghari": "gri",
287
+ "Gikuyu": "kik",
288
+ "Gikyode": "acd",
289
+ "Gilaki": "glk",
290
+ "Gofa": "gof-script_latin",
291
+ "Gogo": "gog",
292
+ "Gokana": "gkn",
293
+ "Gondi, Adilabad": "wsg",
294
+ "Gonja": "gjn",
295
+ "Gor": "gqr",
296
+ "Gorontalo": "gor",
297
+ "Gourmanch\u00e9ma": "gux",
298
+ "Grebo, Northern": "gbo",
299
+ "Greek": "ell",
300
+ "Greek, Ancient": "grc",
301
+ "Guahibo": "guh",
302
+ "Guajaj\u00e1ra": "gub",
303
+ "Guarani": "grn",
304
+ "Guarayu": "gyr",
305
+ "Guayabero": "guo",
306
+ "Gude": "gde",
307
+ "Gujarati": "guj",
308
+ "Gulay": "gvl",
309
+ "Gumuz": "guk",
310
+ "Gungu": "rub",
311
+ "Gwahatike": "dah",
312
+ "Gwere": "gwr",
313
+ "Gwich\u2019in": "gwi",
314
+ "Haitian Creole": "hat",
315
+ "Halbi": "hlb",
316
+ "Hamer-Banna": "amf",
317
+ "Hanga": "hag",
318
+ "Hanunoo": "hnn",
319
+ "Haryanvi": "bgc",
320
+ "Hatam": "had",
321
+ "Hausa": "hau",
322
+ "Hawaii Pidgin": "hwc",
323
+ "Hawu": "hvn",
324
+ "Haya": "hay",
325
+ "Hdi": "xed",
326
+ "Hebrew": "heb",
327
+ "Hehe": "heh",
328
+ "Hiligaynon": "hil",
329
+ "Hindi": "hin",
330
+ "Hindi, Fiji": "hif",
331
+ "Hindustani, Sarnami": "hns",
332
+ "Ho": "hoc",
333
+ "Holiya": "hoy",
334
+ "Huastec": "hus-dialect_centralveracruz",
335
+ "Huave, San Mateo del Mar": "huv",
336
+ "Huli": "hui",
337
+ "Hungarian": "hun",
338
+ "Hupla": "hap",
339
+ "Iban": "iba",
340
+ "Icelandic": "isl",
341
+ "Ida\u2019an": "dbj",
342
+ "Ifugao, Amganad": "ifa",
343
+ "Ifugao, Batad": "ifb",
344
+ "Ifugao, Mayoyao": "ifu",
345
+ "Ifugao, Tuwali": "ifk",
346
+ "If\u00e8": "ife",
347
+ "Ignaciano": "ign",
348
+ "Ika": "ikk",
349
+ "Ikwo": "iqw",
350
+ "Ila": "ilb",
351
+ "Ilocano": "ilo",
352
+ "Imbongu": "imo",
353
+
354
+ "Inga": "inb",
355
+ "Ipili": "ipi",
356
+ "Iraqw": "irk",
357
+ "Islander English Creole": "icr",
358
+ "Itawit": "itv",
359
+ "Itelmen": "itl",
360
+ "Ivbie North-Okpela-Arhe": "atg",
361
+ "Ixil": "ixl-dialect_santamarianebaj",
362
+ "Iyo": "nca",
363
+ "Izere": "izr",
364
+ "Izii": "izz",
365
+ "Jakalteko": "jac",
366
+ "Jamaican English Creole": "jam",
367
+ "Javanese": "jav",
368
+ "Javanese, Suriname": "jvn",
369
+ "Jingpho": "kac",
370
+ "Jola-Fonyi": "dyo",
371
+ "Jola-Kasa": "csk",
372
+ "Jopadhola": "adh",
373
+ "Juang": "jun",
374
+ "Jukun Takum": "jbu",
375
+ "Jula": "dyu",
376
+ "Jur Modo": "bex",
377
+ "Juray": "juy",
378
+ "Kaansa": "gna",
379
+ "Kaapor": "urb",
380
+ "Kabiy\u00e8": "kbp",
381
+ "Kabwa": "cwa",
382
+ "Kadazan Dusun": "dtp",
383
+ "Kafa": "kbr",
384
+ "Kagayanen": "cgc",
385
+ "Kagulu": "kki",
386
+ "Kaili, Da\u2019a": "kzf",
387
+ "Kaili, Ledo": "lew",
388
+ "Kakataibo-Kashibo": "cbr",
389
+ "Kako": "kkj",
390
+ "Kakwa": "keo",
391
+ "Kalagan": "kqe",
392
+ "Kalanguya": "kak",
393
+ "Kalinga, Butbut": "kyb",
394
+ "Kalinga, Lubuagan": "knb",
395
+ "Kalinga, Majukayang": "kmd",
396
+ "Kalinga, Tanudan": "kml",
397
+ "Kallahan, Keley-i": "ify",
398
+ "Kalmyk-Oirat": "xal",
399
+ "Kamano": "kbq",
400
+ "Kamayur\u00e1": "kay",
401
+ "Kambaata": "ktb",
402
+ "Kamwe": "hig",
403
+ "Kandawo": "gam",
404
+ "Kandozi-Chapra": "cbu",
405
+ "Kangri": "xnr",
406
+ "Kanite": "kmu",
407
+ "Kankanaey": "kne",
408
+ "Kannada": "kan",
409
+ "Kanuri, Manga": "kby",
410
+ "Kapampangan": "pam",
411
+ "Kaqchikel": "cak-dialect_central",
412
+ "Karaboro, Eastern": "xrb",
413
+ "Karachay-Balkar": "krc",
414
+ "Karakalpak": "kaa",
415
+ "Karelian": "krl",
416
+ "Karen, Pwo Northern": "pww",
417
+ "Kasem": "xsm",
418
+ "Kashinawa": "cbs",
419
+ "Kaulong": "pss",
420
+ "Kawyaw": "kxf",
421
+ "Kayab\u00ed": "kyz",
422
+ "Kayah, Western": "kyu",
423
+ "Kayap\u00f3": "txu",
424
+ "Kazakh": "kaz",
425
+ "Kebu": "ndp",
426
+ "Keliko": "kbo",
427
+ "Kenga": "kyq",
428
+ "Kenyang": "ken",
429
+ "Kera": "ker",
430
+ "Ketengban": "xte",
431
+ "Keyagana": "kyg",
432
+ "Khakas": "kjh",
433
+ "Khanty": "kca",
434
+ "Khmu": "kjg",
435
+ "Kigiryama": "nyf",
436
+ "Kilivila": "kij",
437
+ "Kim": "kia",
438
+ "Kimaragang": "kqr",
439
+ "Kimr\u00e9": "kqp",
440
+ "Kinaray-a": "krj",
441
+ "Kinga": "zga",
442
+ "Kinyarwanda": "kin",
443
+ "Kipfokomo": "pkb",
444
+ "Kire": "geb",
445
+ "Kiribati": "gil",
446
+ "Kisar": "kje",
447
+ "Kisi, Southern": "kss",
448
+ "Kitharaka": "thk",
449
+ "Klao": "klu",
450
+ "Klon": "kyo",
451
+ "Kogi": "kog",
452
+ "Kolami, Northwestern": "kfb",
453
+ "Komi-Zyrian": "kpv",
454
+ "Konab\u00e9r\u00e9": "bbo",
455
+ "Konkomba": "xon",
456
+ "Konni": "kma",
457
+ "Kono": "kno",
458
+ "Konso": "kxc",
459
+ "Koonzime": "ozm",
460
+ "Koorete": "kqy",
461
+ "Korean": "kor",
462
+ "Koreguaje": "coe",
463
+ "Korupun-Sela": "kpq",
464
+ "Koryak": "kpy",
465
+ "Kouya": "kyf",
466
+ "Koya": "kff-script_telugu",
467
+ "Krio": "kri",
468
+ "Kriol": "rop",
469
+ "Krumen, Plapo": "ktj",
470
+ "Krumen, Tepo": "ted",
471
+ "Krung": "krr",
472
+ "Kuay": "kdt",
473
+ "Kukele": "kez",
474
+ "Kulina": "cul",
475
+ "Kulung": "kle",
476
+ "Kumam": "kdi",
477
+ "Kuman": "kue",
478
+ "Kumyk": "kum",
479
+ "Kuna, Border": "kvn",
480
+ "Kuna, San Blas": "cuk",
481
+ "Kunda": "kdn",
482
+ "Kuo": "xuo",
483
+ "Kupia": "key",
484
+ "Kupsapiiny": "kpz",
485
+ "Kuranko": "knk",
486
+ "Kurdish, Northern": "kmr-script_cyrillic",
487
+ "Kurumba, Alu": "xua",
488
+ "Kurux": "kru",
489
+ "Kusaal": "kus",
490
+ "Kutep": "kub",
491
+ "Kutu": "kdc",
492
+ "Kuvi": "kxv",
493
+ "Kuwaa": "blh",
494
+ "Kuwaataay": "cwt",
495
+ "Kwaio": "kwd",
496
+ "Kwamera": "tnk",
497
+ "Kwara\u2019ae": "kwf",
498
+ "Kwere": "cwe",
499
+ "Kyaka": "kyc",
500
+ "Kyanga": "tye",
501
+ "Kyrgyz": "kir",
502
+ "K\u2019iche\u2019": "quc-dialect_central",
503
+ "Lacandon": "lac",
504
+ "Lacid": "lsi",
505
+ "Ladakhi": "lbj",
506
+ "Lahu": "lhu",
507
+ "Lama": "las",
508
+ "Lamba": "lam",
509
+ "Lamnso\u2019": "lns",
510
+ "Lampung Api": "ljp",
511
+ "Lango": "laj",
512
+ "Lao": "lao",
513
+ "Latin": "lat",
514
+ "Latvian": "lav",
515
+ "Lauje": "law",
516
+ "Lawa, Western": "lcp",
517
+ "Laz": "lzz",
518
+ "Lele": "lln",
519
+ "Lelemi": "lef",
520
+ "Lesser Antillean French Creole": "acf",
521
+ "Lewo": "lww",
522
+ "Lhao Vo": "mhx",
523
+ "Lik": "eip",
524
+ "Limba, West-Central": "lia",
525
+ "Limbu": "lif",
526
+ "Lingao": "onb",
527
+ "Lisu": "lis",
528
+ "Lobala": "loq",
529
+ "Lobi": "lob",
530
+ "Lokaa": "yaz",
531
+ "Loko": "lok",
532
+ "Lole": "llg",
533
+ "Lolopo": "ycl",
534
+ "Loma": "lom",
535
+ "Lomwe": "ngl",
536
+ "Lomwe, Malawi": "lon",
537
+ "Luang": "lex",
538
+ "Lugbara": "lgg",
539
+ "Luguru": "ruf",
540
+ "Lukpa": "dop",
541
+ "Lundayeh": "lnd",
542
+ "Lutos": "ndy",
543
+ "Luwo": "lwo",
544
+ "Ly\u00e9l\u00e9": "lee",
545
+ "Maan": "mev",
546
+ "Mabaan": "mfz",
547
+ "Machame": "jmc",
548
+ "Macuna": "myy",
549
+ "Macushi": "mbc",
550
+ "Mada": "mda",
551
+ "Madura": "mad",
552
+ "Magahi": "mag",
553
+ "Mai Brat": "ayz",
554
+ "Maithili": "mai",
555
+ "Maka": "mca",
556
+ "Makaa": "mcp",
557
+ "Makasar": "mak",
558
+ "Makhuwa": "vmw",
559
+ "Makhuwa-Meetto": "mgh",
560
+ "Makonde": "kde",
561
+ "Malagasy": "mlg",
562
+ "Malay": "zlm",
563
+ "Malay, Central": "pse",
564
+ "Malay, Kupang": "mkn",
565
+ "Malay, Manado": "xmm",
566
+ "Malayalam": "mal",
567
+ "Malayic Dayak": "xdy",
568
+ "Maldivian": "div",
569
+ "Male": "mdy",
570
+ "Malvi": "mup",
571
+ "Mam": "mam-dialect_western",
572
+ "Mamasa": "mqj",
573
+ "Mambila, Cameroon": "mcu",
574
+ "Mambila, Nigeria": "mzk",
575
+ "Mampruli": "maw",
576
+ "Mandeali": "mjl",
577
+ "Mandinka": "mnk",
578
+ "Mango": "mge",
579
+ "Mangseng": "mbh",
580
+ "Mankanya": "knf",
581
+ "Mannan": "mjv",
582
+ "Manobo, Matigsalug": "mbt",
583
+ "Manobo, Obo": "obo",
584
+ "Manobo, Western Bukidnon": "mbb",
585
+ "Manya": "mzj",
586
+ "Mapun": "sjm",
587
+ "Maranao": "mrw",
588
+ "Marathi": "mar",
589
+ "Marba": "mpg",
590
+ "Mari, Meadow": "mhr",
591
+ "Markweeta": "enb",
592
+ "Marshallese": "mah",
593
+ "Masaaba": "myx",
594
+ "Maskelynes": "klv",
595
+ "Matal": "mfh",
596
+ "Mato": "met",
597
+ "Matsigenka": "mcb",
598
+ "Maya, Mop\u00e1n": "mop",
599
+ "Maya, Yucatec": "yua",
600
+ "Mayo": "mfy",
601
+ "Mazahua, Central": "maz",
602
+ "Mazatec, Ayautla": "vmy",
603
+ "Mazatec, Chiquihuitl\u00e1n": "maq",
604
+ "Mazatec, Ixcatl\u00e1n": "mzi",
605
+ "Mazatec, Jalapa de D\u00edaz": "maj",
606
+ "Mazatec, San Jer\u00f3nimo Tec\u00f3atl": "maa-dialect_sanjer\u00f3nimo",
607
+ "Ma\u2019anyan": "mhy",
608
+ "Ma\u2019di": "mhi",
609
+ "Mbandja": "zmz",
610
+ "Mbay": "myb",
611
+ "Mbore": "gai",
612
+ "Mbuko": "mqb",
613
+ "Mbula-Bwazza": "mbu",
614
+ "Melpa": "med",
615
+ "Mende": "men",
616
+ "Mengen": "mee",
617
+ "Mentawai": "mwv",
618
+ "Merey": "meq",
619
+ "Mesme": "zim",
620
+ "Meta\u2019": "mgo",
621
+ "Meyah": "mej",
622
+ "Migabac": "mpp",
623
+ "Minangkabau": "min",
624
+ "Misak": "gum",
625
+ "Misima-Panaeati": "mpx",
626
+ "Mixe, Coatl\u00e1n": "mco",
627
+ "Mixe, Juquila": "mxq",
628
+ "Mixe, Quetzaltepec": "pxm",
629
+ "Mixe, Totontepec": "mto",
630
+ "Mixtec, Alacatlatzala": "mim",
631
+ "Mixtec, Alcozauca": "xta",
632
+ "Mixtec, Amoltepec": "mbz",
633
+ "Mixtec, Apasco-Apoala": "mip",
634
+ "Mixtec, Atatlahuca": "mib",
635
+ "Mixtec, Ayutla": "miy",
636
+ "Mixtec, Chayuco": "mih",
637
+ "Mixtec, Coatzospan": "miz",
638
+ "Mixtec, Diuxi-Tilantongo": "xtd",
639
+ "Mixtec, Jamiltepec": "mxt",
640
+ "Mixtec, Magdalena Pe\u00f1asco": "xtm",
641
+ "Mixtec, Metlat\u00f3noc": "mxv",
642
+ "Mixtec, Northern Tlaxiaco": "xtn",
643
+ "Mixtec, Ocotepec": "mie",
644
+ "Mixtec, Pe\u00f1oles": "mil",
645
+ "Mixtec, Pinotepa Nacional": "mio",
646
+ "Mixtec, Santa Luc\u00eda Monteverde": "mdv",
647
+ "Mixtec, Santa Mar\u00eda Zacatepec": "mza",
648
+ "Mixtec, Southern Puebla": "mit",
649
+ "Mixtec, Tezoatl\u00e1n": "mxb",
650
+ "Mixtec, Yosond\u00faa": "mpm",
651
+ "Miyobe": "soy",
652
+ "Mnong, Central": "cmo-script_khmer",
653
+ "Moba": "mfq",
654
+ "Mochi": "old",
655
+ "Mofu, North": "mfk",
656
+ "Mofu-Gudur": "mif",
657
+ "Mokole": "mkl",
658
+ "Molima": "mox",
659
+ "Moma": "myl",
660
+ "Momuna": "mqf",
661
+ "Mongolian": "mon",
662
+ "Mongondow": "mog",
663
+ "Morisyen": "mfe",
664
+ "Moro": "mor",
665
+ "Moronene": "mqn",
666
+ "Moru": "mgd",
667
+ "Moskona": "mtj",
668
+ "Mro-Khimi": "cmr",
669
+ "Mualang": "mtd",
670
+ "Muinane": "bmr",
671
+ "Mukulu": "moz",
672
+ "Mumuye": "mzm",
673
+ "Muna": "mnb",
674
+ "Mundani": "mnf",
675
+ "Mundari": "unr",
676
+ "Muria, Far Western": "fmu",
677
+ "Murle": "mur",
678
+ "Murut, Timugon": "tih",
679
+ "Muthuvan": "muv",
680
+ "Muyang": "muy",
681
+ "Mwaghavul": "sur",
682
+ "Mwan": "moa",
683
+ "Mwani": "wmw",
684
+ "M\u00e9nik": "tnr",
685
+ "M\u00edskito": "miq",
686
+ "M\u00f2or\u00e9": "mos",
687
+ "M\u00fcnd\u00fc": "muh",
688
+ "Naasioi": "nas",
689
+ "Nad\u00ebb": "mbj",
690
+ "Nafaanra": "nfr",
691
+ "Naga, Kharam": "kfw",
692
+ "Naga, Tangshang": "nst",
693
+ "Nagamese": "nag",
694
+ "Nahuatl, Central Huasteca": "nch",
695
+ "Nahuatl, Eastern Huasteca": "nhe",
696
+ "Nahuatl, Guerrero": "ngu",
697
+ "Nahuatl, Highland Puebla": "azz",
698
+ "Nahuatl, Isthmus-Mecayapan": "nhx",
699
+ "Nahuatl, Michoac\u00e1n": "ncl",
700
+ "Nahuatl, Northern Oaxaca": "nhy",
701
+ "Nahuatl, Northern Puebla": "ncj",
702
+ "Nahuatl, Sierra Negra": "nsu",
703
+ "Nahuatl, Southeastern Puebla": "npl",
704
+ "Nahuatl, Tlamacazapa": "nuz",
705
+ "Nahuatl, Western Huasteca": "nhw",
706
+ "Nahuatl, Zacatl\u00e1n-Ahuacatl\u00e1n-Tepetzintla": "nhi",
707
+ "Nalca": "nlc",
708
+ "Nambiku\u00e1ra, Southern": "nab",
709
+ "Nanai": "gld",
710
+ "Nande": "nnb",
711
+ "Napu": "npy",
712
+ "Nasa": "pbb",
713
+ "Nateni": "ntm",
714
+ "Nawdm": "nmz",
715
+ "Nawuri": "naw",
716
+ "Naxi": "nxq",
717
+ "Ndamba": "ndj",
718
+ "Ndogo": "ndz",
719
+ "Ndut": "ndv",
720
+ "Newar": "new",
721
+ "Ngaju": "nij",
722
+ "Ngambay": "sba",
723
+ "Ngangam": "gng",
724
+ "Ngbaka": "nga",
725
+ "Ngindo": "nnq",
726
+ "Ngulu": "ngp",
727
+ "Ng\u00e4bere": "gym",
728
+ "Ng\u2019akarimojong": "kdj",
729
+ "Nias": "nia",
730
+ "Nilamba": "nim",
731
+ "Ninzo": "nin",
732
+ "Nkonya": "nko",
733
+ "Nogai": "nog",
734
+ "Nomaande": "lem",
735
+ "Nomatsigenga": "not",
736
+ "Noone": "nhu",
737
+ "Ntcham": "bud",
738
+ "Nuer": "nus",
739
+ "Nugunu": "yas",
740
+ "Nuni, Southern": "nnw",
741
+ "Nyabwa": "nwb",
742
+ "Nyakyusa-Ngonde": "nyy",
743
+ "Nyankore": "nyn",
744
+ "Nyaturu": "rim",
745
+ "Nyindrou": "lid",
746
+ "Nyole": "nuj",
747
+ "Nyoro": "nyo",
748
+ "Nzema": "nzi",
749
+ "Obolo": "ann",
750
+ "Odia": "ory",
751
+ "Ojibwa, Northwestern": "ojb-script_syllabics",
752
+ "Oku": "oku",
753
+ "Oniyan": "bsc",
754
+ "Oroko": "bdu",
755
+ "Oromo": "orm",
756
+ "Orya": "ury",
757
+ "Ossetic": "oss",
758
+ "Otomi, Mezquital": "ote",
759
+ "Otomi, Quer\u00e9taro": "otq",
760
+ "Owa": "stn",
761
+ "Paasaal": "sig",
762
+ "Pahari, Kullu": "kfx",
763
+ "Pahari, Mahasu": "bfz",
764
+ "Paicoca": "sey",
765
+ "Paiute, Northern": "pao",
766
+ "Palauan": "pau",
767
+ "Palaung, Ruching": "pce",
768
+ "Palawano, Brooke\u2019s Point": "plw",
769
+ "Pamona": "pmf",
770
+ "Pangasinan": "pag",
771
+ "Papiamentu": "pap",
772
+ "Paranan": "prf",
773
+ "Parec\u00eds": "pab",
774
+ "Parkwa": "pbi",
775
+ "Patamona": "pbc",
776
+ "Paumar\u00ed": "pad",
777
+ "Pele-Ata": "ata",
778
+ "Penan, Eastern": "pez",
779
+ "Pengo": "peg",
780
+ "Persian": "fas",
781
+ "Pidgin, Nigerian": "pcm",
782
+ "Pijin": "pis",
783
+ "Pinyin": "pny",
784
+ "Piratapuyo": "pir",
785
+ "Pitjantjatjara": "pjt",
786
+ "Pogolo": "poy",
787
+ "Polish": "pol",
788
+ "Popoloca, San Lu\u00eds Temalacayuca": "pps",
789
+ "Popoloca, San Marcos Tlacoyalco": "pls",
790
+ "Popoluca, Highland": "poi",
791
+ "Poqomchi\u2019": "poh-dialect_western",
792
+ "Portuguese": "por",
793
+ "Prai": "prt",
794
+ "Puinave": "pui",
795
+ "Punjabi, Eastern": "pan",
796
+ "Purepecha": "tsz",
797
+ "Puroik": "suv",
798
+ "P\u00e9v\u00e9": "lme",
799
+ "Quechua, Ayacucho": "quy",
800
+ "Quechua, Cajamarca": "qvc",
801
+ "Quechua, Cusco": "quz",
802
+ "Quechua, Eastern Apur\u00edmac": "qve",
803
+ "Quechua, Huallaga": "qub",
804
+ "Quechua, Huamal\u00edes-Dos de Mayo Hu\u00e1nuco": "qvh",
805
+ "Quechua, Huaylas Ancash": "qwh",
806
+ "Quechua, Huaylla Wanca": "qvw",
807
+ "Quechua, Lambayeque": "quf",
808
+ "Quechua, Margos-Yarowilca-Lauricocha": "qvm",
809
+ "Quechua, North Bolivian": "qul",
810
+ "Quechua, North Jun\u00edn": "qvn",
811
+ "Quechua, Northern Conchucos Ancash": "qxn",
812
+ "Quechua, Panao": "qxh",
813
+ "Quechua, San Mart\u00edn": "qvs",
814
+ "Quechua, South Bolivian": "quh",
815
+ "Quechua, Southern Conchucos": "qxo",
816
+ "Quichua, Ca\u00f1ar Highland": "qxr",
817
+ "Quichua, Napo": "qvo",
818
+ "Quichua, Northern Pastaza": "qvz",
819
+ "Quichua, Salasaca Highland": "qxl",
820
+ "Quichua, Tena Lowland": "quw",
821
+ "Q\u2019anjob\u2019al": "kjb",
822
+ "Q\u2019eqchi\u2019": "kek",
823
+ "Rabha": "rah",
824
+ "Rajbanshi": "rjs",
825
+ "Ramoaaina": "rai",
826
+ "Rampi": "lje",
827
+ "Ranglong": "rnl",
828
+ "Rangpuri": "rkt",
829
+ "Rapa Nui": "rap",
830
+ "Ravula": "yea",
831
+ "Rawang": "raw",
832
+ "Rejang": "rej",
833
+ "Rendille": "rel",
834
+ "Riang Lang": "ril",
835
+ "Rigwe": "iri",
836
+ "Rikou": "rgu",
837
+ "Rohingya": "rhg",
838
+ "Romani, Carpathian": "rmc-script_cyrillic",
839
+ "Romani, Sinte": "rmo",
840
+ "Romani, Vlax": "rmy-script_cyrillic",
841
+ "Romanian": "ron",
842
+ "Romblomanon": "rol",
843
+ "Ron": "cla",
844
+ "Ronga": "rng",
845
+ "Roviana": "rug",
846
+ "Rundi": "run",
847
+ "Russian": "rus",
848
+ "Saamya-Gwe": "lsm",
849
+ "Sabaot": "spy",
850
+ "Sadri": "sck",
851
+ "Sahu": "saj",
852
+ "Sakachep": "sch",
853
+ "Sama, Central": "sml",
854
+ "Sambal": "xsb",
855
+ "Sambal, Botolan": "sbl",
856
+ "Samburu": "saq",
857
+ "Samo, Southern": "sbd",
858
+ "Samoan": "smo",
859
+ "Sampang": "rav",
860
+ "Sangir": "sxn",
861
+ "Sango": "sag",
862
+ "Sangu": "sbp",
863
+ "Sanum\u00e1": "xsu",
864
+ "Saramaccan": "srm",
865
+ "Sasak": "sas",
866
+ "Sa\u2019a": "apb",
867
+ "Sebat Bet Gurage": "sgw",
868
+ "Sedoa": "tvw",
869
+ "Sekpele": "lip",
870
+ "Selaru": "slu",
871
+ "Selee": "snw",
872
+ "Semai": "sea",
873
+ "Semelai": "sza",
874
+ "Sena": "seh",
875
+ "Seychelles French Creole": "crs",
876
+ "Shambala": "ksb",
877
+ "Shanga": "sho",
878
+ "Sharanahua": "mcd",
879
+ "Shawi": "cbt",
880
+ "Sherpa": "xsr",
881
+ "Shilluk": "shk",
882
+ "Shipibo-Conibo": "shp",
883
+ "Shona": "sna",
884
+ "Shor": "cjs",
885
+ "Shuar": "jiv",
886
+ "Siane": "snp",
887
+ "Siang": "sya",
888
+ "Sidamo": "sid",
889
+ "Siona": "snn",
890
+ "Siriano": "sri",
891
+ "Sirmauri": "srx",
892
+ "Sisaala, Tumulung": "sil",
893
+ "Sissala": "sld",
894
+ "Siwu": "akp",
895
+ "Soga": "xog",
896
+ "Somali": "som",
897
+ "Somba-Siawari": "bmu",
898
+ "Songhay, Koyra Chiini": "khq",
899
+ "Songhay, Koyraboro Senni": "ses",
900
+ "Sougb": "mnx",
901
+ "Spanish": "spa",
902
+ "Sranan Tongo": "srn",
903
+ "Suba": "sxb",
904
+ "Subanon, Western": "suc",
905
+ "Sudest": "tgo",
906
+ "Sukuma": "suk",
907
+ "Sunda": "sun",
908
+ "Sunwar": "suz",
909
+ "Surgujia": "sgj",
910
+ "Susu": "sus",
911
+ "Swahili": "swh",
912
+ "Swedish": "swe",
913
+ "Sylheti": "syl",
914
+ "S\u00e9noufo, Djimini": "dyi",
915
+ "S\u00e9noufo, Mamara": "myk",
916
+ "S\u00e9noufo, Supyire": "spp",
917
+ "Taabwa": "tap",
918
+ "Tabaru": "tby",
919
+ "Tacana": "tna",
920
+ "Tachelhit": "shi",
921
+ "Tado": "klw",
922
+ "Tagalog": "tgl",
923
+ "Tagbanwa, Calamian": "tbk",
924
+ "Tagin": "tgj",
925
+ "Tai Dam": "blt",
926
+ "Tairora, North": "tbg",
927
+ "Tairora, South": "omw",
928
+ "Tajik": "tgk",
929
+ "Tajio": "tdj",
930
+ "Takia": "tbc",
931
+ "Talinga-Bwisi": "tlj",
932
+ "Talysh": "tly",
933
+ "Tamajaq, Tawallammat": "ttq-script_tifinagh",
934
+ "Tamang, Eastern": "taj",
935
+ "Tamasheq": "taq",
936
+ "Tamil": "tam",
937
+ "Tampulma": "tpm",
938
+ "Tangoa": "tgp",
939
+ "Tanna, North": "tnn",
940
+ "Tarahumara, Western": "tac",
941
+ "Tarifit": "rif-script_arabic",
942
+ "Tatar": "tat",
943
+ "Tatuyo": "tav",
944
+ "Tawbuid": "twb",
945
+ "Tboli": "tbl",
946
+ "Tehit": "kps",
947
+ "Teiwa": "twe",
948
+ "Tektiteko": "ttc",
949
+ "Telugu": "tel",
950
+ "Tem": "kdh",
951
+ "Tengger": "tes",
952
+ "Tennet": "tex",
953
+ "Tepehua, Huehuetla": "tee",
954
+ "Tepehua, Pisaflores": "tpp",
955
+ "Tepehua, Tlachichilco": "tpt",
956
+ "Tepehuan, Southeastern": "stp",
957
+ "Teribe": "tfr",
958
+ "Termanu": "twu",
959
+ "Ter\u00eana": "ter",
960
+ "Tewa": "tew",
961
+ "Tharu, Dangaura": "thl",
962
+ "Themne": "tem",
963
+ "Tibetan, Amdo": "adx",
964
+ "Tibetan, Central": "bod",
965
+ "Tibetan, Khams": "khg",
966
+ "Ticuna": "tca",
967
+ "Tigrigna": "tir",
968
+ "Tii": "txq",
969
+ "Tikar": "tik",
970
+ "Tlicho": "dgr",
971
+ "Toba": "tob",
972
+ "Toba-Maskoy": "tmf",
973
+ "Tobanga": "tng",
974
+ "Tobelo": "tlb",
975
+ "Tohono O\u2019odham": "ood",
976
+ "Tok Pisin": "tpi",
977
+ "Tol": "jic",
978
+ "Tolaki": "lbw",
979
+ "Tombonuo": "txa",
980
+ "Tombulu": "tom",
981
+ "Tonga": "toh",
982
+ "Tontemboan": "tnt",
983
+ "Toraja-Sa\u2019dan": "sda",
984
+ "Torres Strait Creole": "tcs",
985
+ "Totonac, Coyutla": "toc",
986
+ "Totonac, Highland": "tos",
987
+ "Toura": "neb",
988
+ "Trinitario": "trn",
989
+ "Triqui, Chicahuaxtla": "trs",
990
+ "Triqui, Copala": "trc",
991
+ "Tri\u00f3": "tri",
992
+ "Tsafiki": "cof",
993
+ "Tsakhur": "tkr",
994
+ "Tsikimba": "kdl",
995
+ "Tsiman\u00e9": "cas",
996
+ "Tsonga": "tso",
997
+ "Tucano": "tuo",
998
+ "Tuma-Irumu": "iou",
999
+ "Tumak": "tmc",
1000
+ "Tunebo, Central": "tuf",
1001
+ "Turkish": "tur",
1002
+ "Turkmen": "tuk-script_arabic",
1003
+ "Tuwuli": "bov",
1004
+ "Tuyuca": "tue",
1005
+ "Tyap": "kcg",
1006
+ "Tzeltal": "tzh-dialect_tenejapa",
1007
+ "Tzotzil": "tzo-dialect_chamula",
1008
+ "Tz\u2019utujil": "tzj-dialect_eastern",
1009
+ "Uab Meto": "aoz",
1010
+ "Udmurt": "udm",
1011
+ "Uduk": "udu",
1012
+ "Ukrainian": "ukr",
1013
+ "Uma": "ppk",
1014
+ "Umbu-Ungu": "ubu",
1015
+ "Urak Lawoi\u2019": "urk",
1016
+ "Urarina": "ura",
1017
+ "Urat": "urt",
1018
+ "Urdu": "urd-script_latin",
1019
+ "Uripiv-Wala-Rano-Atchin": "upv",
1020
+ "Uspanteko": "usp",
1021
+ "Uyghur": "uig-script_cyrillic",
1022
+ "Uzbek": "uzb-script_cyrillic",
1023
+ "Vagla": "vag",
1024
+ "Vengo": "bav",
1025
+ "Vidunda": "vid",
1026
+ "Vili": "vif",
1027
+ "Vunjo": "vun",
1028
+ "Vute": "vut",
1029
+ "Wa, Parauk": "prk",
1030
+ "Waama": "wwa",
1031
+ "Waima": "rro",
1032
+ "Waimaha": "bao",
1033
+ "Waiwai": "waw",
1034
+ "Wala": "lgl",
1035
+ "Wali": "wlx",
1036
+ "Wamey": "cou",
1037
+ "Wamp\u00eds": "hub",
1038
+ "Wanano": "gvc",
1039
+ "Wandala": "mfi",
1040
+ "Wapishana": "wap",
1041
+ "Warao": "wba",
1042
+ "Waray-Waray": "war",
1043
+ "Wayana": "way",
1044
+ "Wayuu": "guc",
1045
+ "Welsh": "cym",
1046
+ "Wersing": "kvw",
1047
+ "Whitesands": "tnp",
1048
+ "Witoto, Minika": "hto",
1049
+ "Witoto, Murui": "huu",
1050
+ "Wolaytta": "wal-script_ethiopic",
1051
+ "Wolio": "wlo",
1052
+ "Woun Meu": "noa",
1053
+ "W\u00e8 Northern": "wob",
1054
+ "Xaasongaxango": "kao",
1055
+ "Xer\u00e9nte": "xer",
1056
+ "Yagua": "yad",
1057
+ "Yakan": "yka",
1058
+ "Yakut": "sah",
1059
+ "Yala": "yba",
1060
+ "Yali, Angguruk": "yli",
1061
+ "Yali, Ninia": "nlk",
1062
+ "Yalunka": "yal",
1063
+ "Yamba": "yam",
1064
+ "Yambeta": "yat",
1065
+ "Yamdena": "jmd",
1066
+ "Yami": "tao",
1067
+ "Yaminahua": "yaa",
1068
+ "Yanesha\u2019": "ame",
1069
+ "Yanomam\u00f6": "guu",
1070
+ "Yao": "yao",
1071
+ "Yaour\u00e9": "yre",
1072
+ "Yawa": "yva",
1073
+ "Yemba": "ybb",
1074
+ "Yine": "pib",
1075
+ "Yipma": "byr",
1076
+ "Yom": "pil",
1077
+ "Yoruba": "yor",
1078
+ "Yucuna": "ycn",
1079
+ "Yupik, Saint Lawrence Island": "ess",
1080
+ "Yuracare": "yuz",
1081
+ "Zaiwa": "atb",
1082
+ "Zande": "zne",
1083
+ "Zapotec, Alo\u00e1pam": "zaq",
1084
+ "Zapotec, Amatl\u00e1n": "zpo",
1085
+ "Zapotec, Cajonos": "zad",
1086
+ "Zapotec, Choapan": "zpc",
1087
+ "Zapotec, Coatecas Altas": "zca",
1088
+ "Zapotec, Guevea de Humboldt": "zpg",
1089
+ "Zapotec, Isthmus": "zai",
1090
+ "Zapotec, Lachix\u00edo": "zpl",
1091
+ "Zapotec, Miahuatl\u00e1n": "zam",
1092
+ "Zapotec, Mitla": "zaw",
1093
+ "Zapotec, Mixtepec": "zpm",
1094
+ "Zapotec, Ocotl\u00e1n": "zac",
1095
+ "Zapotec, Ozolotepec": "zao",
1096
+ "Zapotec, Quioquitani-Quier\u00ed": "ztq",
1097
+ "Zapotec, Rinc\u00f3n": "zar",
1098
+ "Zapotec, San Vicente Coatl\u00e1n": "zpt",
1099
+ "Zapotec, Santa Mar\u00eda Quiegolani": "zpi",
1100
+ "Zapotec, Santo Domingo Albarradas": "zas",
1101
+ "Zapotec, Sierra de Ju\u00e1rez": "zaa",
1102
+ "Zapotec, Texmelucan": "zpz",
1103
+ "Zapotec, Western Tlacolula Valley": "zab",
1104
+ "Zapotec, Yal\u00e1lag": "zpu",
1105
+ "Zapotec, Yareni": "zae",
1106
+ "Zapotec, Yatee": "zty",
1107
+ "Zapotec, Yatzachi": "zav",
1108
+ "Zaza": "zza",
1109
+ "Zhuang, Yongbei": "zyb",
1110
+ "Zigula": "ziw",
1111
+ "Zoque, Francisco Le\u00f3n": "zos",
1112
+ "Zulgo-Gemzek": "gnd",
1113
+ "\u00c9w\u00e9": "ewe"
1114
+ }
mm_num2word.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Convert Myanmar (Burmese) numerals found in text into spoken words.

This file is adapted from https://github.com/hpbyte/Myanmar_Number_to_Words
"""
import re

# Spoken form of every Myanmar digit (U+1040 .. U+1049).
# NOTE(review): several words end in an ASCII ":" instead of the Myanmar
# visarga (U+1038). That is how the table is stored; presumably the
# downstream text front end expects it -- confirm before "correcting" it.
mm_digit = {
    "၀": "သုည",
    "၁": "တစ်",
    "၂": "နှစ်",
    "၃": "သုံ:",
    "၄": "လေ:",
    "၅": "ငါ:",
    "၆": "ခြောက်",
    "၇": "ခုနှစ်",
    "၈": "ရှစ်",
    "၉": "ကို:",
}

# The digit that every positional step of convert_digit() skips.
_ZERO = "၀"

# regular expressions
# Raw strings keep the "\/" and "\." escapes exactly as before without
# tripping Python's invalid-escape-sequence warning.
rgxPh = r"^(၀၁|၀၉)"
rgxDate = r"[၀-၉]{1,2}-[၀-၉]{1,2}-[၀-၉]{4}|[၀-၉]{1,2}\/[၀-၉]{1,2}\/[၀-၉]{4}"
rgxTime = r"[၀-၉]{1,2}:[၀-၉]{1,2}"
rgxDec = r"[၀-၉]*\.[၀-၉]*"
rgxAmt = r"[,၀-၉]+"


def convert_digit(num):
    """
    Convert a plain run of Myanmar digits into spoken words.

    The number is read position by position (hundred-thousands, ten-thousands,
    thousands, hundreds, tens, ones); positions holding a zero are skipped.
    A non-empty run consisting only of zeros is read as the word for zero,
    and an empty string yields an empty string.

    @type   num str
    @param  num Myanmar number (digits only, separators already removed)
    @rtype  str
    @return converted Myanmar spoken words
    @raise  KeyError if a non-zero position holds a character that is not a
            Myanmar digit
    """

    # Every step below skips zero digits, so an all-zero run would otherwise
    # come out as "" (e.g. the integer part of "၀.၅").
    if set(num) == {_ZERO}:
        return mm_digit[_ZERO]

    converted = ""
    nb_digits = len(num)

    def is_nonzero(pos):
        # pos counts from the right: 1 = ones, 2 = tens, ...
        return num[-pos] != _ZERO

    def hundred_thousandth_val():
        # Everything above the five low-order digits is counted in units of
        # one hundred thousand. A count ending in two zeros is spoken after
        # the unit word, any other count before it.
        n = num[:-5]
        return (
            ("သိန်: " + mm_num2word(n))
            if (n[-2:] == "၀၀")
            else (mm_num2word(n) + "သိန်: ")
        )

    # In the two suffixes below U+1037 (dot below) is spelled as an escape so
    # that it stays AFTER the asat (U+103A), the code-point order this module
    # has always emitted.
    def thousandth_val():
        return mm_digit[num[-4]] + (
            "ထောင် " if (num[-3:] == "၀၀၀") else "ထောင်\u1037 "
        )

    def hundredth_val():
        # The suffix carries the extra tone mark when exactly one of the two
        # trailing digits is zero.
        one_trailing_zero = (num[-2] == _ZERO) != (num[-1] == _ZERO)
        return mm_digit[num[-3]] + ("ရာ့ " if one_trailing_zero else "ရာ ")

    def tenth_val():
        # A leading "one" is not spoken in front of the word for ten.
        return ("" if (num[-2] == "၁") else mm_digit[num[-2]]) + (
            "ဆယ် " if (num[-1] == _ZERO) else "ဆယ်\u1037 "
        )

    if nb_digits > 5:
        converted += hundred_thousandth_val()
    if (nb_digits > 4) and is_nonzero(5):
        converted += mm_digit[num[-5]] + "သောင်: "
    if (nb_digits > 3) and is_nonzero(4):
        converted += thousandth_val()
    if (nb_digits > 2) and is_nonzero(3):
        converted += hundredth_val()
    if (nb_digits > 1) and is_nonzero(2):
        converted += tenth_val()
    if (nb_digits > 0) and is_nonzero(1):
        converted += mm_digit[num[-1]]

    return converted


def mm_num2word(num):
    """
    Detect type of number and convert accordingly

    The formats are tried in this order: phone number, date, time, decimal,
    amount. The first one that matches the start of ``num`` decides how the
    string is read.

    @type   num str
    @param  num Myanmar number
    @rtype  str
    @return converted Myanmar spoken words
    @raise  ValueError if ``num`` matches none of the supported formats
    """

    word = ""

    # phone number: read digit by digit. The prefix test alone is not enough,
    # because a time such as "၀၉:၃၀" or a date such as "၀၁-၀၅-၂၀၂၃" starts
    # with the same two digits and would fail on its separator; such strings
    # must fall through to the branches below.
    if re.match(rgxPh, num[:2]) and all(d in mm_digit for d in num):
        word = " ".join([(mm_digit[d] if not d == "၇" else "ခွန်") for d in num])
    # date: written day, month, year -- spoken year, month, day
    elif re.match(rgxDate, num):
        n = re.split(r"-|/", num)
        word = (
            convert_digit(n[-1])
            + " ခုနှစ် "
            + convert_digit(n[1])
            + " လပိုင်: "
            + convert_digit(n[0])
            + " ရက်"
        )
    # time: exactly thirty minutes is spoken as "half"
    elif re.match(rgxTime, num):
        n = re.split(r":", num)
        word = (convert_digit(n[0]) + " နာရီ ") + (
            "ခွဲ" if (n[1] == "၃၀") else (convert_digit(n[1]) + " မိနစ်")
        )
    # decimal: integer part as a number, fraction digit by digit
    elif re.match(rgxDec, num):
        n = re.split(r"\.", num)
        word = convert_digit(n[0]) + " ဒဿမ " + " ".join([mm_digit[d] for d in n[1]])
    # amount: thousands separators are dropped before conversion
    elif re.match(rgxAmt, num):
        word = convert_digit(num.replace(",", ""))
    # default
    else:
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working.
        raise ValueError("Cannot convert the provided number format!")

    return word


def extract_num(S):
    """
    Extract numbers from the input string

    Dates, times, decimals and plain amounts are recognised, in that order of
    preference at every position.

    @type   S str
    @param  S Myanmar sentence
    @rtype  list
    @return a list of Myanmar numbers, in order of appearance
    """
    pattern = "|".join((rgxDate, rgxTime, rgxDec, rgxAmt))

    return re.findall(pattern, S)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ttsmms
2
+ underthesea
3
+ nltk
4
+ soundfile
5
+ num2words