Spaces:
Daextream
/
Runtime error

Wasghington dpc commited on
Commit
567073a
0 Parent(s):

Duplicate from dpc/mmstts

Browse files

Co-authored-by: Cuong Ph Dang <dpc@users.noreply.huggingface.co>

Files changed (9) hide show
  1. .gitattributes +34 -0
  2. .vscode/settings.json +6 -0
  3. README.md +21 -0
  4. app.py +214 -0
  5. lang_code.txt +1144 -0
  6. mm_num2word.py +142 -0
  7. num2words_lang_map.json +29 -0
  8. requirements.txt +5 -0
  9. test.py +19 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.vscode/settings.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "[python]": {
3
+ "editor.defaultFormatter": "ms-python.black-formatter"
4
+ },
5
+ "python.formatting.provider": "none"
6
+ }
README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Massively Multilingual Speech (MMS) - Text To Speech
3
+ emoji: 🌍
4
+ colorFrom: yellow
5
+ colorTo: gray
6
+ sdk: gradio
7
+ app_file: app.py
8
+ duplicated_from: dpc/mmstts
9
+ ---
10
+
11
+ ## Info
12
+ Text to speech for 1000+ languages, using [fairseq](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) MMS TTS and the [ttsmms](https://github.com/wannaphong/ttsmms) wrapper.
13
+
14
+ + Language ISO code list (`lang_code.txt`) is adapted from
15
+ https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html
16
+
17
+ The dropdown list is quite long, so I have placed some of my friends' frequently used languages at the top. The other 1000+ languages are sorted alphabetically.
18
+
19
+ + `mm_num2word.py` is adapted from https://github.com/hpbyte/Myanmar_Number_to_Words
20
+
21
+ + For other dependencies, please refer to the `requirements.txt` file.
app.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Based on example code of https://huggingface.co/facebook/m2m100_1.2B
2
+ # and https://github.com/wannaphong/ttsmms
3
+ # See also https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md
4
+
5
+ import gradio as gr
6
+ import os
7
+ import re
8
+ import soundfile as sf
9
+
10
+ import json
11
+ import nltk
12
+ from underthesea import sent_tokenize as vie_sent_tokenize # Vietnamese NLP toolkit
13
+ from underthesea import text_normalize as vie_text_normalize
14
+ from nltk import sent_tokenize as nltk_sent_tokenize
15
+ from ttsmms import download
16
+ from ttsmms import TTS
17
+
18
+ from collections import OrderedDict
19
+ import uuid
20
+ import datetime
21
+ import shutil
22
+ from num2words import num2words
23
+
24
+
25
# Markdown blurb shown under the title of the Gradio interface.
this_description = """Text To Speech for [1000+ languages](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) - using [fairseq MMS TTS](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) and [ttsmms](https://github.com/wannaphong/ttsmms) wrapper.
Please note that for some languages, it may not pronounce all words correctly (yet).
"""

# Tokenizer data needed by nltk's sent_tokenize.
nltk.download("punkt")

# Pre-download some languages
tts_models = {}
eng_path = download("eng", "./data")
tts_models["eng"] = eng_path
vie_path = download("vie", "./data")
tts_models["vie"] = vie_path
mya_path = download("mya", "./data")
tts_models["mya"] = mya_path

# Maps the dropdown label "<language name> (<iso code>)" to the ISO code,
# preserving the order of lang_code.txt.
lang_codes = OrderedDict()

# The language list contains non-ASCII names (e.g. "Aché"), so the encoding is
# given explicitly instead of relying on the platform default.
with open("lang_code.txt", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        # Skip blank lines and the "----" separator between the pinned
        # languages and the alphabetical list.
        if not line or line.startswith("----"):
            continue
        iso, lang = line.split("\t", 1)
        lang_codes[lang + " (" + iso + ")"] = iso

language_names = list(lang_codes.keys())

# Load num2words_lang_map
with open("num2words_lang_map.json", encoding="utf-8") as f:
    num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict)
56
+
57
+
58
def convert_numbers_to_words_num2words(text, lang):
    """Spell out every run of digits in ``text`` via ``num2words``.

    ``lang`` is looked up in ``num2words_lang_map``; the first element of the
    mapped entry is passed to ``num2words`` as its ``lang`` argument.
    Returns the text with each digit run replaced by its word form.
    """
    # Longest digit runs are handled first so that a short number never
    # rewrites part of a longer one that has not been converted yet.
    digit_runs = re.findall(r"\d+", text)
    digit_runs.sort(key=len, reverse=True)
    print(digit_runs)

    for digits in digit_runs:
        spelled = num2words(int(digits), lang=num2words_lang_map[lang][0])
        text = text.replace(digits, spelled)

    return text
71
+
72
+
73
def convert_mya_numbers_to_words(text):
    """Replace each number found by ``extract_num`` with ``mm_num2word`` output.

    Both helpers come from the local ``mm_num2word`` module, imported lazily on
    each call (presumably they handle Burmese numerals; confirm against that
    module). Returns the rewritten text.
    """
    from mm_num2word import mm_num2word, extract_num

    # Longest matches first, so a short number cannot clobber part of a
    # longer one.
    found = sorted(extract_num(text), key=len, reverse=True)
    print(found)

    for token in found:
        text = text.replace(token, mm_num2word(token))
    return text
83
+
84
+
85
def prepare_sentences(text, lang="mya"):
    """Normalise ``text`` and split it into a list of non-empty sentences.

    Steps, in order:
      1. For ``mya``: spell out numbers and map U+104A / U+104B (Myanmar
         section marks) to "," and ".".
      2. If ``lang`` is a key of ``num2words_lang_map``: spell out digit runs.
      3. Lower-case everything.
      4. Split on newlines, drop blank paragraphs, then sentence-tokenize each
         paragraph (underthesea for ``vie``, nltk otherwise).
    """
    lang_key = lang.lower()

    # pre-process the text for some languages
    if lang_key == "mya":
        text = convert_mya_numbers_to_words(text)
        text = text.replace("\u104A", ",").replace("\u104B", ".")

    if lang in num2words_lang_map:
        print("num2words supports this lang", lang)
        text = convert_numbers_to_words_num2words(text, lang)
        print("Processed text", text)

    # Lower-casing was observed to fix unclear pronunciation of the first
    # word for vie; the reason is not known.
    text = text.lower()

    paragraphs = [chunk for chunk in text.split("\n") if chunk.strip()]

    if lang_key == "vie":
        return [
            vie_text_normalize(piece)
            for chunk in paragraphs
            for piece in vie_sent_tokenize(chunk)
            if piece.strip()
        ]

    sentences = []
    for chunk in paragraphs:
        for piece in nltk_sent_tokenize(chunk):
            if piece.strip():
                sentences.append(piece)
    return sentences
120
+
121
+
122
def list_dir(lang):
    """Print a short summary of the WAV files in the current directory.

    Prints the working directory, the number of ``.wav`` files in it and,
    tagged with ``lang``, the alphabetically last one. Returns ``None``.
    An empty directory is reported instead of raising ``IndexError``.
    """
    current_dir = os.getcwd()
    print(current_dir)

    wav_files = [name for name in os.listdir(current_dir) if name.endswith(".wav")]
    print("Total wav files:", len(wav_files))

    if not wav_files:
        # Nothing to report; indexing [-1] on an empty list would raise.
        print(lang, "(no wav files)")
        return

    # The alphabetically last file is the same element sorted(...)[-1] picks.
    print(lang, max(wav_files))
137
+
138
+
139
def combine_wav(source_dir, stamp, lang):
    """Concatenate every WAV file in ``source_dir`` into one output file.

    Files are joined in alphabetical order of their names and written to
    ``"{stamp}_{lang}.wav"`` in the current working directory using the sample
    rate reported for the last file read. ``source_dir`` is deleted afterwards.

    Returns the path of the combined file.
    Raises ``ValueError`` if ``source_dir`` contains no ``.wav`` files
    (previously this surfaced as an unbound-variable error at write time).
    """
    # Alphabetical order is the playback order.
    wav_files = sorted(name for name in os.listdir(source_dir) if name.endswith(".wav"))
    if not wav_files:
        raise ValueError(f"No WAV files found in {source_dir}; nothing to combine")

    combined_data = []
    sample_rate = None
    for name in wav_files:
        data, sample_rate = sf.read(os.path.join(source_dir, name))
        combined_data.extend(data)

    # Save the combined audio to a new WAV file
    combined_file_path = f"{stamp}_{lang}.wav"
    sf.write(combined_file_path, combined_data, sample_rate)

    # The per-request pieces are no longer needed once merged.
    shutil.rmtree(source_dir)
    list_dir(lang)

    return combined_file_path
162
+
163
+
164
def mms_tts(Input_Text, lang_name="Burmese (mya)"):
    """Synthesize ``Input_Text`` in the selected language and return a WAV path.

    ``lang_name`` is a dropdown label (a key of ``lang_codes``); unknown labels
    fall back to ``"mya"``. Each sentence is synthesized into its own file in
    a scratch directory, and the pieces are merged by ``combine_wav``.

    Returns the path of the combined WAV file, or ``None`` when there is
    nothing to synthesize (empty or whitespace-only input).
    """
    lang_code = lang_codes.get(lang_name, "mya")

    # Bail out before downloading or loading a model when there is no text;
    # otherwise combine_wav would fail on an empty scratch directory.
    if not Input_Text or not Input_Text.strip():
        return None
    sentences = prepare_sentences(Input_Text, lang_code)
    if not sentences:
        return None

    user_model = download(lang_code, "./data")
    tts = TTS(user_model)

    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")

    user_dir = f"u_{timestamp}"
    if os.path.exists(user_dir):
        # Same-microsecond collision: disambiguate with a random session ID.
        user_dir = f"u_{uuid.uuid4()}_{timestamp}"
    os.makedirs(user_dir, exist_ok=True)
    print("New user directory", user_dir)

    # Zero-padded indices keep alphabetical order equal to sentence order,
    # which combine_wav relies on.
    for i, sentence in enumerate(sentences):
        tts.synthesis(sentence, wav_path=f"{user_dir}/s_{str(i).zfill(10)}.wav")

    return combine_wav(user_dir, timestamp, lang_code)
191
+
192
+
193
+ # common_languages = ["eng", "mya", "vie"] # List of common language codes
194
# Input widgets: free text plus the language selector (defaults to Burmese).
text_input = gr.Textbox(
    lines=5,
    placeholder="Enter text (unlimited sentences)",
    label="Input text (unlimited sentences)",
)
language_input = gr.Dropdown(
    choices=language_names,
    label="Select language 1,000+",
    value="Burmese (mya)",
)

# Single audio output; a separate gr.File download output was tried and
# left disabled.
iface = gr.Interface(
    fn=mms_tts,
    title="Massively Multilingual Speech (MMS) - Text To Speech",
    description=this_description,
    inputs=[text_input, language_input],
    outputs="audio",
)


iface.launch()
lang_code.txt ADDED
@@ -0,0 +1,1144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ mya Burmese
2
+ mnw Mon
3
+ shn Shan
4
+ eng English
5
+ vie Vietnamese
6
+ tha Thai
7
+ nod Thai, Northern
8
+ ind Indonesian
9
+ khm Khmer
10
+ kxm Khmer, Northern
11
+ --------------
12
+ abi Abidji
13
+ ace Aceh
14
+ aca Achagua
15
+ acn Achang
16
+ acr Achi
17
+ ach Acholi
18
+ acu Achuar-Shiwiar
19
+ guq Aché
20
+ ade Adele
21
+ adj Adioukrou
22
+ agd Agarabi
23
+ agx Aghul
24
+ agn Agutaynen
25
+ aha Ahanta
26
+ aka Akan
27
+ knj Akateko
28
+ ake Akawaio
29
+ aeu Akeu
30
+ ahk Akha
31
+ bss Akoose
32
+ alj Alangan
33
+ sqi Albanian
34
+ alt Altai, Southern
35
+ alp Alune
36
+ alz Alur
37
+ kab Amazigh
38
+ amk Ambai
39
+ mmg Ambrym, North
40
+ amh Amharic
41
+ ami Amis
42
+ azg Amuzgo, San Pedro Amuzgos
43
+ agg Angor
44
+ boj Anjam
45
+ cko Anufo
46
+ any Anyin
47
+ arl Arabela
48
+ ara Arabic
49
+ atq Aralle-Tabulahan
50
+ luc Aringa
51
+ hyw Armenian, Western
52
+ apr Arop-Lokep
53
+ aia Arosi
54
+ msy Aruamu
55
+ cni Asháninka
56
+ cjo Ashéninka, Pajonal
57
+ cpu Ashéninka, Pichis
58
+ cpb Ashéninka, Ucayali-Yurúa
59
+ asm Assamese
60
+ asa Asu
61
+ teo Ateso
62
+ ati Attié
63
+ djk Aukan
64
+ ava Avar
65
+ avn Avatime
66
+ avu Avokaya
67
+ awb Awa
68
+ kwi Awa-Cuaiquer
69
+ awa Awadhi
70
+ agr Awajún
71
+ agu Awakateko
72
+ ayr Aymara, Central
73
+ ayo Ayoreo
74
+ abp Ayta, Abellen
75
+ blx Ayta, Mag-Indi
76
+ sgb Ayta, Mag-antsi
77
+ azj-script_cyrillic Azerbaijani, North
78
+ azj-script_latin Azerbaijani, North
79
+ azb Azerbaijani, South
80
+ bba Baatonum
81
+ bhz Bada
82
+ bvc Baelelea
83
+ bfy Bagheli
84
+ bgq Bagri
85
+ bdq Bahnar
86
+ bdh Baka
87
+ bqi Bakhtiâri
88
+ bjw Bakwé
89
+ blz Balantak
90
+ ban Bali
91
+ bcc-script_latin Balochi, Southern
92
+ bcc-script_arabic Balochi, Southern
93
+ bam Bamanankan
94
+ ptu Bambam
95
+ bcw Bana
96
+ bqj Bandial
97
+ bno Bantoanon
98
+ bbb Barai
99
+ bfa Bari
100
+ bjz Baruga
101
+ bak Bashkort
102
+ eus Basque
103
+ bsq Bassa
104
+ akb Batak Angkola
105
+ btd Batak Dairi
106
+ btx Batak Karo
107
+ bts Batak Simalungun
108
+ bbc Batak Toba
109
+ bvz Bauzi
110
+ bjv Bedjond
111
+ bep Behoa
112
+ bkv Bekwarra
113
+ bzj Belize English Creole
114
+ bem Bemba
115
+ bng Benga
116
+ ben Bengali
117
+ bom Berom
118
+ btt Bete-Bendi
119
+ bha Bharia
120
+ bgw Bhatri
121
+ bht Bhattiyali
122
+ beh Biali
123
+ sne Bidayuh, Bau
124
+ ubl Bikol, Buhi’non
125
+ bcl Bikol, Central
126
+ bim Bimoba
127
+ bkd Binukid
128
+ bjr Binumarien
129
+ bfo Birifor, Malba
130
+ biv Birifor, Southern
131
+ bib Bisa
132
+ bis Bislama
133
+ bzi Bisu
134
+ bqp Bisã
135
+ bpr Blaan, Koronadal
136
+ bps Blaan, Sarangani
137
+ bwq Bobo Madaré, Southern
138
+ bdv Bodo Parja
139
+ bqc Boko
140
+ bus Bokobaru
141
+ bnp Bola
142
+ bmq Bomu
143
+ bdg Bonggi
144
+ boa Bora
145
+ ksr Borong
146
+ bor Borôro
147
+ bru Bru, Eastern
148
+ box Buamu
149
+ bzh Buang, Mapos
150
+ bgt Bughotu
151
+ sab Buglere
152
+ bul Bulgarian
153
+ bwu Buli
154
+ bmv Bum
155
+ tte Bwanabwana
156
+ cjp Cabécar
157
+ cbv Cacua
158
+ kaq Capanahua
159
+ cot Caquinte
160
+ cbc Carapana
161
+ car Carib
162
+ cat Catalan
163
+ ceb Cebuano
164
+ cme Cerma
165
+ cbi Chachi
166
+ ceg Chamacoco
167
+ cly Chatino, Eastern Highland
168
+ cya Chatino, Nopala
169
+ che Chechen
170
+ hne Chhattisgarhi
171
+ nya Chichewa
172
+ dig Chidigo
173
+ dug Chiduruma
174
+ bgr Chin, Bawm
175
+ cek Chin, Eastern Khumi
176
+ cfm Chin, Falam
177
+ cnh Chin, Hakha
178
+ hlt Chin, Matu
179
+ mwq Chin, Müün
180
+ ctd Chin, Tedim
181
+ tcz Chin, Thado
182
+ zyp Chin, Zyphe
183
+ cco Chinantec, Comaltepec
184
+ cnl Chinantec, Lalana
185
+ cle Chinantec, Lealao
186
+ chz Chinantec, Ozumacín
187
+ cpa Chinantec, Palantla
188
+ cso Chinantec, Sochiapam
189
+ cnt Chinantec, Tepetotutla
190
+ cuc Chinantec, Usila
191
+ hak Chinese, Hakka
192
+ nan Chinese, Min Nan
193
+ xnj Chingoni
194
+ cap Chipaya
195
+ cax Chiquitano
196
+ ctg Chittagonian
197
+ ctu Chol
198
+ chf Chontal, Tabasco
199
+ cce Chopi
200
+ crt Chorote, Iyojwa’ja
201
+ crq Chorote, Iyo’wujwa
202
+ cac-dialect_sansebastiáncoatán Chuj
203
+ cac-dialect_sanmateoixtatán Chuj
204
+ ckt Chukchi
205
+ ncu Chumburung
206
+ cdj Churahi
207
+ chv Chuvash
208
+ caa Ch’orti’
209
+ asg Cishingini
210
+ con Cofán
211
+ crn Cora, El Nayar
212
+ cok Cora, Santa Teresa
213
+ crk-script_latin Cree, Plains
214
+ crk-script_syllabics Cree, Plains
215
+ crh Crimean Tatar
216
+ cui Cuiba
217
+ dsh Daasanach
218
+ dbq Daba
219
+ dga Dagaare, Southern
220
+ dgi Dagara, Northern
221
+ dgk Dagba
222
+ dnj-dialect_gweetaawueast Dan
223
+ dnj-dialect_blowowest Dan
224
+ daa Dangaléat
225
+ dnt Dani, Mid Grand Valley
226
+ dnw Dani, Western
227
+ dar Dargwa
228
+ tcc Datooga
229
+ dwr Dawro
230
+ ded Dedua
231
+ mzw Deg
232
+ ntr Delo
233
+ ddn Dendi
234
+ des Desano
235
+ dso Desiya
236
+ nfa Dhao
237
+ dhi Dhimal
238
+ gud Dida, Yocoboué
239
+ did Didinga
240
+ mhu Digaro-Mishmi
241
+ dip Dinka, Northeastern
242
+ dik Dinka, Southwestern
243
+ tbz Ditammari
244
+ dts Dogon, Toro So
245
+ dos Dogosé
246
+ dgo Dogri
247
+ mvp Duri
248
+ nld Dutch
249
+ jen Dza
250
+ dzo Dzongkha
251
+ idd Ede Idaca
252
+ eka Ekajuk
253
+ cto Embera Catío
254
+ emp Emberá, Northern
255
+ enx Enxet
256
+ sja Epena
257
+ myv Erzya
258
+ mcq Ese
259
+ ese Ese Ejja
260
+ evn Evenki
261
+ eza Ezaa
262
+ fal Fali, South
263
+ fao Faroese
264
+ far Fataleka
265
+ fij Fijian
266
+ fin Finnish
267
+ fon Fon
268
+ frd Fordata
269
+ fra French
270
+ ful Fulah
271
+ flr Fuliiru
272
+ gau Gadaba, Mudhili
273
+ gbk Gaddi
274
+ gag-script_cyrillic Gagauz
275
+ gag-script_latin Gagauz
276
+ gbi Galela
277
+ gmv Gamo
278
+ lug Ganda
279
+ pwg Gapapaiwa
280
+ gbm Garhwali
281
+ cab Garifuna
282
+ grt Garo
283
+ krs Gbaya
284
+ gso Gbaya, Southwest
285
+ nlg Gela
286
+ gej Gen
287
+ deu German, Standard
288
+ gri Ghari
289
+ kik Gikuyu
290
+ acd Gikyode
291
+ glk Gilaki
292
+ gof-script_latin Gofa
293
+ gog Gogo
294
+ gkn Gokana
295
+ wsg Gondi, Adilabad
296
+ gjn Gonja
297
+ gqr Gor
298
+ gor Gorontalo
299
+ gux Gourmanchéma
300
+ gbo Grebo, Northern
301
+ ell Greek
302
+ grc Greek, Ancient
303
+ guh Guahibo
304
+ gub Guajajára
305
+ grn Guarani
306
+ gyr Guarayu
307
+ guo Guayabero
308
+ gde Gude
309
+ guj Gujarati
310
+ gvl Gulay
311
+ guk Gumuz
312
+ rub Gungu
313
+ dah Gwahatike
314
+ gwr Gwere
315
+ gwi Gwich’in
316
+ hat Haitian Creole
317
+ hlb Halbi
318
+ amf Hamer-Banna
319
+ hag Hanga
320
+ hnn Hanunoo
321
+ bgc Haryanvi
322
+ had Hatam
323
+ hau Hausa
324
+ hwc Hawaii Pidgin
325
+ hvn Hawu
326
+ hay Haya
327
+ xed Hdi
328
+ heb Hebrew
329
+ heh Hehe
330
+ hil Hiligaynon
331
+ hin Hindi
332
+ hif Hindi, Fiji
333
+ hns Hindustani, Sarnami
334
+ hoc Ho
335
+ hoy Holiya
336
+ hus-dialect_westernpotosino Huastec
337
+ hus-dialect_centralveracruz Huastec
338
+ huv Huave, San Mateo del Mar
339
+ hui Huli
340
+ hun Hungarian
341
+ hap Hupla
342
+ iba Iban
343
+ isl Icelandic
344
+ dbj Ida’an
345
+ ifa Ifugao, Amganad
346
+ ifb Ifugao, Batad
347
+ ifu Ifugao, Mayoyao
348
+ ifk Ifugao, Tuwali
349
+ ife Ifè
350
+ ign Ignaciano
351
+ ikk Ika
352
+ iqw Ikwo
353
+ ilb Ila
354
+ ilo Ilocano
355
+ imo Imbongu
356
+ inb Inga
357
+ ipi Ipili
358
+ irk Iraqw
359
+ icr Islander English Creole
360
+ itv Itawit
361
+ itl Itelmen
362
+ atg Ivbie North-Okpela-Arhe
363
+ ixl-dialect_sanjuancotzal Ixil
364
+ ixl-dialect_sangasparchajul Ixil
365
+ ixl-dialect_santamarianebaj Ixil
366
+ nca Iyo
367
+ izr Izere
368
+ izz Izii
369
+ jac Jakalteko
370
+ jam Jamaican English Creole
371
+ jav Javanese
372
+ jvn Javanese, Suriname
373
+ kac Jingpho
374
+ dyo Jola-Fonyi
375
+ csk Jola-Kasa
376
+ adh Jopadhola
377
+ jun Juang
378
+ jbu Jukun Takum
379
+ dyu Jula
380
+ bex Jur Modo
381
+ juy Juray
382
+ gna Kaansa
383
+ urb Kaapor
384
+ kbp Kabiyè
385
+ cwa Kabwa
386
+ dtp Kadazan Dusun
387
+ kbr Kafa
388
+ cgc Kagayanen
389
+ kki Kagulu
390
+ kzf Kaili, Da’a
391
+ lew Kaili, Ledo
392
+ cbr Kakataibo-Kashibo
393
+ kkj Kako
394
+ keo Kakwa
395
+ kqe Kalagan
396
+ kak Kalanguya
397
+ kyb Kalinga, Butbut
398
+ knb Kalinga, Lubuagan
399
+ kmd Kalinga, Majukayang
400
+ kml Kalinga, Tanudan
401
+ ify Kallahan, Keley-i
402
+ xal Kalmyk-Oirat
403
+ kbq Kamano
404
+ kay Kamayurá
405
+ ktb Kambaata
406
+ hig Kamwe
407
+ gam Kandawo
408
+ cbu Kandozi-Chapra
409
+ xnr Kangri
410
+ kmu Kanite
411
+ kne Kankanaey
412
+ kan Kannada
413
+ kby Kanuri, Manga
414
+ pam Kapampangan
415
+ cak-dialect_santamaríadejesús Kaqchikel
416
+ cak-dialect_southcentral Kaqchikel
417
+ cak-dialect_yepocapa Kaqchikel
418
+ cak-dialect_western Kaqchikel
419
+ cak-dialect_santodomingoxenacoj Kaqchikel
420
+ cak-dialect_central Kaqchikel
421
+ xrb Karaboro, Eastern
422
+ krc Karachay-Balkar
423
+ kaa Karakalpak
424
+ krl Karelian
425
+ pww Karen, Pwo Northern
426
+ xsm Kasem
427
+ cbs Kashinawa
428
+ pss Kaulong
429
+ kxf Kawyaw
430
+ kyz Kayabí
431
+ kyu Kayah, Western
432
+ txu Kayapó
433
+ kaz Kazakh
434
+ ndp Kebu
435
+ kbo Keliko
436
+ kyq Kenga
437
+ ken Kenyang
438
+ ker Kera
439
+ xte Ketengban
440
+ kyg Keyagana
441
+ kjh Khakas
442
+ kca Khanty
443
+ kjg Khmu
444
+ nyf Kigiryama
445
+ kij Kilivila
446
+ kia Kim
447
+ kqr Kimaragang
448
+ kqp Kimré
449
+ krj Kinaray-a
450
+ zga Kinga
451
+ kin Kinyarwanda
452
+ pkb Kipfokomo
453
+ geb Kire
454
+ gil Kiribati
455
+ kje Kisar
456
+ kss Kisi, Southern
457
+ thk Kitharaka
458
+ klu Klao
459
+ kyo Klon
460
+ kog Kogi
461
+ kfb Kolami, Northwestern
462
+ kpv Komi-Zyrian
463
+ bbo Konabéré
464
+ xon Konkomba
465
+ kma Konni
466
+ kno Kono
467
+ kxc Konso
468
+ ozm Koonzime
469
+ kqy Koorete
470
+ kor Korean
471
+ coe Koreguaje
472
+ kpq Korupun-Sela
473
+ kpy Koryak
474
+ kyf Kouya
475
+ kff-script_telugu Koya
476
+ kri Krio
477
+ rop Kriol
478
+ ktj Krumen, Plapo
479
+ ted Krumen, Tepo
480
+ krr Krung
481
+ kdt Kuay
482
+ kez Kukele
483
+ cul Kulina
484
+ kle Kulung
485
+ kdi Kumam
486
+ kue Kuman
487
+ kum Kumyk
488
+ kvn Kuna, Border
489
+ cuk Kuna, San Blas
490
+ kdn Kunda
491
+ xuo Kuo
492
+ key Kupia
493
+ kpz Kupsapiiny
494
+ knk Kuranko
495
+ kmr-script_latin Kurdish, Northern
496
+ kmr-script_arabic Kurdish, Northern
497
+ kmr-script_cyrillic Kurdish, Northern
498
+ xua Kurumba, Alu
499
+ kru Kurux
500
+ kus Kusaal
501
+ kub Kutep
502
+ kdc Kutu
503
+ kxv Kuvi
504
+ blh Kuwaa
505
+ cwt Kuwaataay
506
+ kwd Kwaio
507
+ tnk Kwamera
508
+ kwf Kwara’ae
509
+ cwe Kwere
510
+ kyc Kyaka
511
+ tye Kyanga
512
+ kir Kyrgyz
513
+ quc-dialect_north K’iche’
514
+ quc-dialect_east K’iche’
515
+ quc-dialect_central K’iche’
516
+ lac Lacandon
517
+ lsi Lacid
518
+ lbj Ladakhi
519
+ lhu Lahu
520
+ las Lama
521
+ lam Lamba
522
+ lns Lamnso’
523
+ ljp Lampung Api
524
+ laj Lango
525
+ lao Lao
526
+ lat Latin
527
+ lav Latvian
528
+ law Lauje
529
+ lcp Lawa, Western
530
+ lzz Laz
531
+ lln Lele
532
+ lef Lelemi
533
+ acf Lesser Antillean French Creole
534
+ lww Lewo
535
+ mhx Lhao Vo
536
+ eip Lik
537
+ lia Limba, West-Central
538
+ lif Limbu
539
+ onb Lingao
540
+ lis Lisu
541
+ loq Lobala
542
+ lob Lobi
543
+ yaz Lokaa
544
+ lok Loko
545
+ llg Lole
546
+ ycl Lolopo
547
+ lom Loma
548
+ ngl Lomwe
549
+ lon Lomwe, Malawi
550
+ lex Luang
551
+ lgg Lugbara
552
+ ruf Luguru
553
+ dop Lukpa
554
+ lnd Lundayeh
555
+ ndy Lutos
556
+ lwo Luwo
557
+ lee Lyélé
558
+ mev Maan
559
+ mfz Mabaan
560
+ jmc Machame
561
+ myy Macuna
562
+ mbc Macushi
563
+ mda Mada
564
+ mad Madura
565
+ mag Magahi
566
+ ayz Mai Brat
567
+ mai Maithili
568
+ mca Maka
569
+ mcp Makaa
570
+ mak Makasar
571
+ vmw Makhuwa
572
+ mgh Makhuwa-Meetto
573
+ kde Makonde
574
+ mlg Malagasy
575
+ zlm Malay
576
+ pse Malay, Central
577
+ mkn Malay, Kupang
578
+ xmm Malay, Manado
579
+ mal Malayalam
580
+ xdy Malayic Dayak
581
+ div Maldivian
582
+ mdy Male
583
+ mup Malvi
584
+ mam-dialect_central Mam
585
+ mam-dialect_northern Mam
586
+ mam-dialect_southern Mam
587
+ mam-dialect_western Mam
588
+ mqj Mamasa
589
+ mcu Mambila, Cameroon
590
+ mzk Mambila, Nigeria
591
+ maw Mampruli
592
+ mjl Mandeali
593
+ mnk Mandinka
594
+ mge Mango
595
+ mbh Mangseng
596
+ knf Mankanya
597
+ mjv Mannan
598
+ mbt Manobo, Matigsalug
599
+ obo Manobo, Obo
600
+ mbb Manobo, Western Bukidnon
601
+ mzj Manya
602
+ sjm Mapun
603
+ mrw Maranao
604
+ mar Marathi
605
+ mpg Marba
606
+ mhr Mari, Meadow
607
+ enb Markweeta
608
+ mah Marshallese
609
+ myx Masaaba
610
+ klv Maskelynes
611
+ mfh Matal
612
+ met Mato
613
+ mcb Matsigenka
614
+ mop Maya, Mopán
615
+ yua Maya, Yucatec
616
+ mfy Mayo
617
+ maz Mazahua, Central
618
+ vmy Mazatec, Ayautla
619
+ maq Mazatec, Chiquihuitlán
620
+ mzi Mazatec, Ixcatlán
621
+ maj Mazatec, Jalapa de Díaz
622
+ maa-dialect_sanantonio Mazatec, San Jerónimo Tecóatl
623
+ maa-dialect_sanjerónimo Mazatec, San Jerónimo Tecóatl
624
+ mhy Ma’anyan
625
+ mhi Ma’di
626
+ zmz Mbandja
627
+ myb Mbay
628
+ gai Mbore
629
+ mqb Mbuko
630
+ mbu Mbula-Bwazza
631
+ med Melpa
632
+ men Mende
633
+ mee Mengen
634
+ mwv Mentawai
635
+ meq Merey
636
+ zim Mesme
637
+ mgo Meta’
638
+ mej Meyah
639
+ mpp Migabac
640
+ min Minangkabau
641
+ gum Misak
642
+ mpx Misima-Panaeati
643
+ mco Mixe, Coatlán
644
+ mxq Mixe, Juquila
645
+ pxm Mixe, Quetzaltepec
646
+ mto Mixe, Totontepec
647
+ mim Mixtec, Alacatlatzala
648
+ xta Mixtec, Alcozauca
649
+ mbz Mixtec, Amoltepec
650
+ mip Mixtec, Apasco-Apoala
651
+ mib Mixtec, Atatlahuca
652
+ miy Mixtec, Ayutla
653
+ mih Mixtec, Chayuco
654
+ miz Mixtec, Coatzospan
655
+ xtd Mixtec, Diuxi-Tilantongo
656
+ mxt Mixtec, Jamiltepec
657
+ xtm Mixtec, Magdalena Peñasco
658
+ mxv Mixtec, Metlatónoc
659
+ xtn Mixtec, Northern Tlaxiaco
660
+ mie Mixtec, Ocotepec
661
+ mil Mixtec, Peñoles
662
+ mio Mixtec, Pinotepa Nacional
663
+ mdv Mixtec, Santa Lucía Monteverde
664
+ mza Mixtec, Santa María Zacatepec
665
+ mit Mixtec, Southern Puebla
666
+ mxb Mixtec, Tezoatlán
667
+ mpm Mixtec, Yosondúa
668
+ soy Miyobe
669
+ cmo-script_latin Mnong, Central
670
+ cmo-script_khmer Mnong, Central
671
+ mfq Moba
672
+ old Mochi
673
+ mfk Mofu, North
674
+ mif Mofu-Gudur
675
+ mkl Mokole
676
+ mox Molima
677
+ myl Moma
678
+ mqf Momuna
679
+ mon Mongolian
680
+ mog Mongondow
681
+ mfe Morisyen
682
+ mor Moro
683
+ mqn Moronene
684
+ mgd Moru
685
+ mtj Moskona
686
+ cmr Mro-Khimi
687
+ mtd Mualang
688
+ bmr Muinane
689
+ moz Mukulu
690
+ mzm Mumuye
691
+ mnb Muna
692
+ mnf Mundani
693
+ unr Mundari
694
+ fmu Muria, Far Western
695
+ mur Murle
696
+ tih Murut, Timugon
697
+ muv Muthuvan
698
+ muy Muyang
699
+ sur Mwaghavul
700
+ moa Mwan
701
+ wmw Mwani
702
+ tnr Ménik
703
+ miq Mískito
704
+ mos Mòoré
705
+ muh Mündü
706
+ nas Naasioi
707
+ mbj Nadëb
708
+ nfr Nafaanra
709
+ kfw Naga, Kharam
710
+ nst Naga, Tangshang
711
+ nag Nagamese
712
+ nch Nahuatl, Central Huasteca
713
+ nhe Nahuatl, Eastern Huasteca
714
+ ngu Nahuatl, Guerrero
715
+ azz Nahuatl, Highland Puebla
716
+ nhx Nahuatl, Isthmus-Mecayapan
717
+ ncl Nahuatl, Michoacán
718
+ nhy Nahuatl, Northern Oaxaca
719
+ ncj Nahuatl, Northern Puebla
720
+ nsu Nahuatl, Sierra Negra
721
+ npl Nahuatl, Southeastern Puebla
722
+ nuz Nahuatl, Tlamacazapa
723
+ nhw Nahuatl, Western Huasteca
724
+ nhi Nahuatl, Zacatlán-Ahuacatlán-Tepetzintla
725
+ nlc Nalca
726
+ nab Nambikuára, Southern
727
+ gld Nanai
728
+ nnb Nande
729
+ npy Napu
730
+ pbb Nasa
731
+ ntm Nateni
732
+ nmz Nawdm
733
+ naw Nawuri
734
+ nxq Naxi
735
+ ndj Ndamba
736
+ ndz Ndogo
737
+ ndv Ndut
738
+ new Newar
739
+ nij Ngaju
740
+ sba Ngambay
741
+ gng Ngangam
742
+ nga Ngbaka
743
+ nnq Ngindo
744
+ ngp Ngulu
745
+ gym Ngäbere
746
+ kdj Ng’akarimojong
747
+ nia Nias
748
+ nim Nilamba
749
+ nin Ninzo
750
+ nko Nkonya
751
+ nog Nogai
752
+ lem Nomaande
753
+ not Nomatsigenga
754
+ nhu Noone
755
+ bud Ntcham
756
+ nus Nuer
757
+ yas Nugunu
758
+ nnw Nuni, Southern
759
+ nwb Nyabwa
760
+ nyy Nyakyusa-Ngonde
761
+ nyn Nyankore
762
+ rim Nyaturu
763
+ lid Nyindrou
764
+ nuj Nyole
765
+ nyo Nyoro
766
+ nzi Nzema
767
+ ann Obolo
768
+ ory Odia
769
+ ojb-script_latin Ojibwa, Northwestern
770
+ ojb-script_syllabics Ojibwa, Northwestern
771
+ oku Oku
772
+ bsc Oniyan
773
+ bdu Oroko
774
+ orm Oromo
775
+ ury Orya
776
+ oss Ossetic
777
+ ote Otomi, Mezquital
778
+ otq Otomi, Querétaro
779
+ stn Owa
780
+ sig Paasaal
781
+ kfx Pahari, Kullu
782
+ bfz Pahari, Mahasu
783
+ sey Paicoca
784
+ pao Paiute, Northern
785
+ pau Palauan
786
+ pce Palaung, Ruching
787
+ plw Palawano, Brooke’s Point
788
+ pmf Pamona
789
+ pag Pangasinan
790
+ pap Papiamentu
791
+ prf Paranan
792
+ pab Parecís
793
+ pbi Parkwa
794
+ pbc Patamona
795
+ pad Paumarí
796
+ ata Pele-Ata
797
+ pez Penan, Eastern
798
+ peg Pengo
799
+ fas Persian
800
+ pcm Pidgin, Nigerian
801
+ pis Pijin
802
+ pny Pinyin
803
+ pir Piratapuyo
804
+ pjt Pitjantjatjara
805
+ poy Pogolo
806
+ pol Polish
807
+ pps Popoloca, San Luís Temalacayuca
808
+ pls Popoloca, San Marcos Tlacoyalco
809
+ poi Popoluca, Highland
810
+ poh-dialect_eastern Poqomchi’
811
+ poh-dialect_western Poqomchi’
812
+ por Portuguese
813
+ prt Prai
814
+ pui Puinave
815
+ pan Punjabi, Eastern
816
+ tsz Purepecha
817
+ suv Puroik
818
+ lme Pévé
819
+ quy Quechua, Ayacucho
820
+ qvc Quechua, Cajamarca
821
+ quz Quechua, Cusco
822
+ qve Quechua, Eastern Apurímac
823
+ qub Quechua, Huallaga
824
+ qvh Quechua, Huamalíes-Dos de Mayo Huánuco
825
+ qwh Quechua, Huaylas Ancash
826
+ qvw Quechua, Huaylla Wanca
827
+ quf Quechua, Lambayeque
828
+ qvm Quechua, Margos-Yarowilca-Lauricocha
829
+ qul Quechua, North Bolivian
830
+ qvn Quechua, North Junín
831
+ qxn Quechua, Northern Conchucos Ancash
832
+ qxh Quechua, Panao
833
+ qvs Quechua, San Martín
834
+ quh Quechua, South Bolivian
835
+ qxo Quechua, Southern Conchucos
836
+ qxr Quichua, Cañar Highland
837
+ qvo Quichua, Napo
838
+ qvz Quichua, Northern Pastaza
839
+ qxl Quichua, Salasaca Highland
840
+ quw Quichua, Tena Lowland
841
+ kjb Q’anjob’al
842
+ kek Q’eqchi’
843
+ rah Rabha
844
+ rjs Rajbanshi
845
+ rai Ramoaaina
846
+ lje Rampi
847
+ rnl Ranglong
848
+ rkt Rangpuri
849
+ rap Rapa Nui
850
+ yea Ravula
851
+ raw Rawang
852
+ rej Rejang
853
+ rel Rendille
854
+ ril Riang Lang
855
+ iri Rigwe
856
+ rgu Rikou
857
+ rhg Rohingya
858
+ rmc-script_latin Romani, Carpathian
859
+ rmc-script_cyrillic Romani, Carpathian
860
+ rmo Romani, Sinte
861
+ rmy-script_latin Romani, Vlax
862
+ rmy-script_cyrillic Romani, Vlax
863
+ ron Romanian
864
+ rol Romblomanon
865
+ cla Ron
866
+ rng Ronga
867
+ rug Roviana
868
+ run Rundi
869
+ rus Russian
870
+ lsm Saamya-Gwe
871
+ spy Sabaot
872
+ sck Sadri
873
+ saj Sahu
874
+ sch Sakachep
875
+ sml Sama, Central
876
+ xsb Sambal
877
+ sbl Sambal, Botolan
878
+ saq Samburu
879
+ sbd Samo, Southern
880
+ smo Samoan
881
+ rav Sampang
882
+ sxn Sangir
883
+ sag Sango
884
+ sbp Sangu
885
+ xsu Sanumá
886
+ srm Saramaccan
887
+ sas Sasak
888
+ apb Sa’a
889
+ sgw Sebat Bet Gurage
890
+ tvw Sedoa
891
+ lip Sekpele
892
+ slu Selaru
893
+ snw Selee
894
+ sea Semai
895
+ sza Semelai
896
+ seh Sena
897
+ crs Seychelles French Creole
898
+ ksb Shambala
899
+ sho Shanga
900
+ mcd Sharanahua
901
+ cbt Shawi
902
+ xsr Sherpa
903
+ shk Shilluk
904
+ shp Shipibo-Conibo
905
+ sna Shona
906
+ cjs Shor
907
+ jiv Shuar
908
+ snp Siane
909
+ sya Siang
910
+ sid Sidamo
911
+ snn Siona
912
+ sri Siriano
913
+ srx Sirmauri
914
+ sil Sisaala, Tumulung
915
+ sld Sissala
916
+ akp Siwu
917
+ xog Soga
918
+ som Somali
919
+ bmu Somba-Siawari
920
+ khq Songhay, Koyra Chiini
921
+ ses Songhay, Koyraboro Senni
922
+ mnx Sougb
923
+ spa Spanish
924
+ srn Sranan Tongo
925
+ sxb Suba
926
+ suc Subanon, Western
927
+ tgo Sudest
928
+ suk Sukuma
929
+ sun Sunda
930
+ suz Sunwar
931
+ sgj Surgujia
932
+ sus Susu
933
+ swh Swahili
934
+ swe Swedish
935
+ syl Sylheti
936
+ dyi Sénoufo, Djimini
937
+ myk Sénoufo, Mamara
938
+ spp Sénoufo, Supyire
939
+ tap Taabwa
940
+ tby Tabaru
941
+ tna Tacana
942
+ shi Tachelhit
943
+ klw Tado
944
+ tgl Tagalog
945
+ tbk Tagbanwa, Calamian
946
+ tgj Tagin
947
+ blt Tai Dam
948
+ tbg Tairora, North
949
+ omw Tairora, South
950
+ tgk Tajik
951
+ tdj Tajio
952
+ tbc Takia
953
+ tlj Talinga-Bwisi
954
+ tly Talysh
955
+ ttq-script_tifinagh Tamajaq, Tawallammat
956
+ taj Tamang, Eastern
957
+ taq Tamasheq
958
+ tam Tamil
959
+ tpm Tampulma
960
+ tgp Tangoa
961
+ tnn Tanna, North
962
+ tac Tarahumara, Western
963
+ rif-script_latin Tarifit
964
+ rif-script_arabic Tarifit
965
+ tat Tatar
966
+ tav Tatuyo
967
+ twb Tawbuid
968
+ tbl Tboli
969
+ kps Tehit
970
+ twe Teiwa
971
+ ttc Tektiteko
972
+ tel Telugu
973
+ kdh Tem
974
+ tes Tengger
975
+ tex Tennet
976
+ tee Tepehua, Huehuetla
977
+ tpp Tepehua, Pisaflores
978
+ tpt Tepehua, Tlachichilco
979
+ stp Tepehuan, Southeastern
980
+ tfr Teribe
981
+ twu Termanu
982
+ ter Terêna
983
+ tew Tewa
984
+ thl Tharu, Dangaura
985
+ tem Themne
986
+ adx Tibetan, Amdo
987
+ bod Tibetan, Central
988
+ khg Tibetan, Khams
989
+ tca Ticuna
990
+ tir Tigrigna
991
+ txq Tii
992
+ tik Tikar
993
+ dgr Tlicho
994
+ tob Toba
995
+ tmf Toba-Maskoy
996
+ tng Tobanga
997
+ tlb Tobelo
998
+ ood Tohono O’odham
999
+ tpi Tok Pisin
1000
+ jic Tol
1001
+ lbw Tolaki
1002
+ txa Tombonuo
1003
+ tom Tombulu
1004
+ toh Tonga
1005
+ tnt Tontemboan
1006
+ sda Toraja-Sa’dan
1007
+ tcs Torres Strait Creole
1008
+ toc Totonac, Coyutla
1009
+ tos Totonac, Highland
1010
+ neb Toura
1011
+ trn Trinitario
1012
+ trs Triqui, Chicahuaxtla
1013
+ trc Triqui, Copala
1014
+ tri Trió
1015
+ cof Tsafiki
1016
+ tkr Tsakhur
1017
+ kdl Tsikimba
1018
+ cas Tsimané
1019
+ tso Tsonga
1020
+ tuo Tucano
1021
+ iou Tuma-Irumu
1022
+ tmc Tumak
1023
+ tuf Tunebo, Central
1024
+ tur Turkish
1025
+ tuk-script_latin Turkmen
1026
+ tuk-script_arabic Turkmen
1027
+ bov Tuwuli
1028
+ tue Tuyuca
1029
+ kcg Tyap
1030
+ tzh-dialect_bachajón Tzeltal
1031
+ tzh-dialect_tenejapa Tzeltal
1032
+ tzo-dialect_chenalhó Tzotzil
1033
+ tzo-dialect_chamula Tzotzil
1034
+ tzj-dialect_western Tz’utujil
1035
+ tzj-dialect_eastern Tz’utujil
1036
+ aoz Uab Meto
1037
+ udm Udmurt
1038
+ udu Uduk
1039
+ ukr Ukrainian
1040
+ ppk Uma
1041
+ ubu Umbu-Ungu
1042
+ urk Urak Lawoi’
1043
+ ura Urarina
1044
+ urt Urat
1045
+ urd-script_devanagari Urdu
1046
+ urd-script_arabic Urdu
1047
+ urd-script_latin Urdu
1048
+ upv Uripiv-Wala-Rano-Atchin
1049
+ usp Uspanteko
1050
+ uig-script_arabic Uyghur
1051
+ uig-script_cyrillic Uyghur
1052
+ uzb-script_cyrillic Uzbek
1053
+ vag Vagla
1054
+ bav Vengo
1055
+ vid Vidunda
1056
+ vif Vili
1057
+ vun Vunjo
1058
+ vut Vute
1059
+ prk Wa, Parauk
1060
+ wwa Waama
1061
+ rro Waima
1062
+ bao Waimaha
1063
+ waw Waiwai
1064
+ lgl Wala
1065
+ wlx Wali
1066
+ cou Wamey
1067
+ hub Wampís
1068
+ gvc Wanano
1069
+ mfi Wandala
1070
+ wap Wapishana
1071
+ wba Warao
1072
+ war Waray-Waray
1073
+ way Wayana
1074
+ guc Wayuu
1075
+ cym Welsh
1076
+ kvw Wersing
1077
+ tnp Whitesands
1078
+ hto Witoto, Minika
1079
+ huu Witoto, Murui
1080
+ wal-script_latin Wolaytta
1081
+ wal-script_ethiopic Wolaytta
1082
+ wlo Wolio
1083
+ noa Woun Meu
1084
+ wob Wè Northern
1085
+ kao Xaasongaxango
1086
+ xer Xerénte
1087
+ yad Yagua
1088
+ yka Yakan
1089
+ sah Yakut
1090
+ yba Yala
1091
+ yli Yali, Angguruk
1092
+ nlk Yali, Ninia
1093
+ yal Yalunka
1094
+ yam Yamba
1095
+ yat Yambeta
1096
+ jmd Yamdena
1097
+ tao Yami
1098
+ yaa Yaminahua
1099
+ ame Yanesha’
1100
+ guu Yanomamö
1101
+ yao Yao
1102
+ yre Yaouré
1103
+ yva Yawa
1104
+ ybb Yemba
1105
+ pib Yine
1106
+ byr Yipma
1107
+ pil Yom
1108
+ yor Yoruba
1109
+ ycn Yucuna
1110
+ ess Yupik, Saint Lawrence Island
1111
+ yuz Yuracare
1112
+ atb Zaiwa
1113
+ zne Zande
1114
+ zaq Zapotec, Aloápam
1115
+ zpo Zapotec, Amatlán
1116
+ zad Zapotec, Cajonos
1117
+ zpc Zapotec, Choapan
1118
+ zca Zapotec, Coatecas Altas
1119
+ zpg Zapotec, Guevea de Humboldt
1120
+ zai Zapotec, Isthmus
1121
+ zpl Zapotec, Lachixío
1122
+ zam Zapotec, Miahuatlán
1123
+ zaw Zapotec, Mitla
1124
+ zpm Zapotec, Mixtepec
1125
+ zac Zapotec, Ocotlán
1126
+ zao Zapotec, Ozolotepec
1127
+ ztq Zapotec, Quioquitani-Quierí
1128
+ zar Zapotec, Rincón
1129
+ zpt Zapotec, San Vicente Coatlán
1130
+ zpi Zapotec, Santa María Quiegolani
1131
+ zas Zapotec, Santo Domingo Albarradas
1132
+ zaa Zapotec, Sierra de Juárez
1133
+ zpz Zapotec, Texmelucan
1134
+ zab Zapotec, Western Tlacolula Valley
1135
+ zpu Zapotec, Yalálag
1136
+ zae Zapotec, Yareni
1137
+ zty Zapotec, Yatee
1138
+ zav Zapotec, Yatzachi
1139
+ zza Zaza
1140
+ zyb Zhuang, Yongbei
1141
+ ziw Zigula
1142
+ zos Zoque, Francisco León
1143
+ gnd Zulgo-Gemzek
1144
+ ewe Éwé
mm_num2word.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file is adapted from https://github.com/hpbyte/Myanmar_Number_to_Words
3
+ """
4
+ import re
5
+
6
# Spoken word for each Myanmar digit.
mm_digit = {
    "၀": "သုည",
    "၁": "တစ်",
    "၂": "နှစ်",
    "၃": "သုံ:",
    "၄": "လေ:",
    "၅": "ငါ:",
    "၆": "ခြောက်",
    "၇": "ခုနှစ်",
    "၈": "ရှစ်",
    "၉": "ကို:",
}

# regular expressions
# Raw strings are used so that the regex escapes (\/ and \.) are not parsed
# as (invalid) Python string escapes.
# Phone numbers start with ၀၁ or ၀၉.
rgxPh = r"^(၀၁|၀၉)"
# d-m-yyyy or d/m/yyyy
rgxDate = r"[၀-၉]{1,2}-[၀-၉]{1,2}-[၀-၉]{4}|[၀-၉]{1,2}\/[၀-၉]{1,2}\/[၀-၉]{4}"
# h:mm
rgxTime = r"[၀-၉]{1,2}:[၀-၉]{1,2}"
# Decimal number: at least one digit must sit next to the dot, so a lone "."
# (plain punctuation) is not treated as a number.
rgxDec = r"[၀-၉]*\.[၀-၉]+|[၀-၉]+\."
# Amount: digits with optional thousands separators.
rgxAmt = r"[,၀-၉]+"
25
+
26
+
27
def convert_digit(num):
    """
    Convert a plain Myanmar digit string into Myanmar spoken words.

    Leading zeros are ignored, and a value of zero is spoken with the word
    for zero instead of being silently dropped.

    @type   num str
    @param  num Myanmar number (Myanmar digits only, no separators)
    @rtype  str
    @return converted Myanmar spoken words ("" for an empty string)
    @raise  KeyError if num contains a character that is not a Myanmar digit
    """
    if not num:
        return ""

    # Leading zeros carry no value; stripping them keeps the positional
    # logic below from emitting a unit word for an all-zero prefix.
    num = num.lstrip("၀")
    if not num:
        # The input consisted only of zeros.
        return mm_digit["၀"]

    converted = ""
    nb_digits = len(num)

    def is_nonzero(pos):
        # True when the digit `pos` places from the right is not zero.
        return num[-pos] != "၀"

    def hundred_thousandth_val():
        # Everything left of the lowest five digits is the count of
        # hundred-thousands. Recurse on the digits directly so the prefix
        # can never be re-interpreted as a phone number, date or time.
        n = num[:-5]
        if n[-2:] == "၀၀":
            return "သိန်: " + convert_digit(n)
        return convert_digit(n) + "သိန်: "

    def thousandth_val():
        # The plain unit word is used when nothing follows the thousands.
        return mm_digit[num[-4]] + ("ထောင် " if (num[-3:] == "၀၀၀") else "ထောင့် ")

    def hundredth_val():
        # The alternate unit word is used when exactly one of the last two
        # digits is non-zero.
        return mm_digit[num[-3]] + (
            "ရာ့ "
            if (
                (num[-2] == "၀" and re.match(r"[၁-၉]", num[-1]))
                or (re.match(r"[၁-၉]", num[-2]) and num[-1] == "၀")
            )
            else "ရာ "
        )

    def tenth_val():
        # A tens digit of one is spoken as the bare unit word.
        return ("" if (num[-2] == "၁") else mm_digit[num[-2]]) + (
            "ဆယ် " if (num[-1] == "၀") else "ဆယ့် "
        )

    if nb_digits > 5:
        converted += hundred_thousandth_val()
    if (nb_digits > 4) and is_nonzero(5):
        converted += mm_digit[num[-5]] + "သောင်: "
    if (nb_digits > 3) and is_nonzero(4):
        converted += thousandth_val()
    if (nb_digits > 2) and is_nonzero(3):
        converted += hundredth_val()
    if (nb_digits > 1) and is_nonzero(2):
        converted += tenth_val()
    if is_nonzero(1):
        converted += mm_digit[num[-1]]

    return converted
81
+
82
+
83
def mm_num2word(num):
    """
    Detect type of number and convert accordingly

    Formats are tried in this order: phone number, date, time, decimal,
    amount.

    @type   num str
    @param  num Myanmar number
    @rtype  str
    @return converted Myanmar spoken words
    @raise  ValueError if num matches none of the supported formats
    """

    # phone number
    # Only a plain digit string can be a phone number. Without this guard a
    # date, time or decimal that starts with ၀၁ / ၀၉ would be spelled digit
    # by digit and fail with KeyError on its separator.
    if re.fullmatch(r"[၀-၉]+", num) and re.match(rgxPh, num):
        # In phone numbers the digit ၇ uses a different spoken form.
        return " ".join("ခွန်" if d == "၇" else mm_digit[d] for d in num)

    # date
    if re.match(rgxDate, num):
        n = re.split(r"-|/", num)
        # Spoken order is year, month, day; the written order is d-m-y.
        return (
            convert_digit(n[-1])
            + " ခုနှစ် "
            + convert_digit(n[1])
            + " လပိုင်: "
            + convert_digit(n[0])
            + " ရက်"
        )

    # time
    if re.match(rgxTime, num):
        n = re.split(r":", num)
        # Exactly thirty minutes gets a dedicated word instead of a count.
        return (convert_digit(n[0]) + " နာရီ ") + (
            "ခွဲ" if (n[1] == "၃၀") else (convert_digit(n[1]) + " မိနစ်")
        )

    # decimal
    if re.match(rgxDec, num):
        n = re.split(r"\.", num)
        # The fractional part is read out digit by digit.
        return convert_digit(n[0]) + " ဒဿမ " + " ".join(mm_digit[d] for d in n[1])

    # amount
    if re.match(rgxAmt, num):
        return convert_digit(num.replace(",", ""))

    # default
    raise ValueError("Cannot convert the provided number format!")
127
+
128
+
129
def extract_num(S):
    """
    Extract numbers from the input string

    Dates, times, decimals and amounts are recognised, in that order of
    preference at each position.

    @type   S str
    @param  S Myanmar sentence
    @rtype  list
    @return a list of Myanmar numbers
    """
    pattern = "%s|%s|%s|%s" % (rgxDate, rgxTime, rgxDec, rgxAmt)

    # The decimal and amount patterns can match bare punctuation ("." or
    # ","); keep only matches that actually contain a Myanmar digit.
    return [m for m in re.findall(pattern, S) if re.search(r"[၀-၉]", m)]
num2words_lang_map.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eng": ["en", "English, default"],
3
+ "amh": ["am", "Amharic"],
4
+ "ara": ["ar", "Arabic"],
5
+ "deu": ["de", "German"],
6
+ "spa": ["es", "Spanish"],
7
+ "fas": ["fa", "Farsi"],
8
+ "fin": ["fi", "Finnish"],
9
+ "fra": ["fr", "French"],
10
+ "heb": ["he", "Hebrew"],
11
+ "hun": ["hu", "Hungarian"],
12
+ "ind": ["id", "Indonesian"],
13
+ "isl": ["is", "Icelandic"],
14
+ "kan": ["kn", "Kannada"],
15
+ "kor": ["ko", "Korean"],
16
+ "kaz": ["kz", "Kazakh"],
17
+ "lav": ["lv", "Latvian"],
18
+ "pol": ["pl", "Polish"],
19
+ "swe": ["sv", "Swedish"],
20
+ "ron": ["ro", "Romanian"],
21
+ "rus": ["ru", "Russian"],
22
+ "tel": ["te", "Telugu"],
23
+ "tgk": ["tg", "Tajik"],
24
+ "tur": ["tr", "Turkish"],
25
+ "tha": ["th", "Thai"],
26
+ "vie": ["vi", "Vietnamese"],
27
+ "nld": ["nl", "Dutch"],
28
+ "ukr": ["uk", "Ukrainian"]
29
+ }
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ttsmms
2
+ underthesea
3
+ nltk
4
+ soundfile
5
+ num2words
test.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+
3
# Maps "<language name>(<iso code>)" display labels to their ISO code, in
# file order.
lang_codes = OrderedDict()

# The language names contain non-ASCII characters, so do not rely on the
# platform's default encoding (assumes lang_code.txt is stored as UTF-8).
with open("lang_code.txt", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        # Skip blank lines and "----" separator lines; a blank line would
        # otherwise break the two-value unpacking below.
        if not line or line.startswith("----"):
            continue
        # Each entry is "<iso code><TAB><language name>".
        iso, lang = line.split("\t", 1)
        lang_codes[lang + "(" + iso + ")"] = iso

# print(lang_codes)
print(len(lang_codes))

language_names = list(lang_codes.keys())

print(language_names)
print(lang_codes[language_names[0]])