Raghavan1988 commited on
Commit
f86940b
1 Parent(s): 579282f

adding language list from seamless_m4t

Browse files
Files changed (2) hide show
  1. app.py +9 -0
  2. lang_list.py +402 -0
app.py CHANGED
@@ -9,6 +9,15 @@ import json
9
  from decouple import Config
10
  from transformers import AutoProcessor, SeamlessM4TModel
11
 
 
 
 
 
 
 
 
 
 
12
  config = Config('.env')
13
 
14
  DEFAULT_TARGET_LANGUAGE = "English"
 
9
  from decouple import Config
10
  from transformers import AutoProcessor, SeamlessM4TModel
11
 
12
+ from lang_list import (
13
+ LANGUAGE_NAME_TO_CODE,
14
+ S2ST_TARGET_LANGUAGE_NAMES,
15
+ S2TT_TARGET_LANGUAGE_NAMES,
16
+ T2TT_TARGET_LANGUAGE_NAMES,
17
+ TEXT_SOURCE_LANGUAGE_NAMES,
18
+ LANG_TO_SPKR_ID,
19
+ )
20
+
21
  config = Config('.env')
22
 
23
  DEFAULT_TARGET_LANGUAGE = "English"
lang_list.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Language dict
2
+ language_code_to_name = {
3
+ "afr": "Afrikaans",
4
+ "amh": "Amharic",
5
+ "arb": "Modern Standard Arabic",
6
+ "ary": "Moroccan Arabic",
7
+ "arz": "Egyptian Arabic",
8
+ "asm": "Assamese",
9
+ "ast": "Asturian",
10
+ "azj": "North Azerbaijani",
11
+ "bel": "Belarusian",
12
+ "ben": "Bengali",
13
+ "bos": "Bosnian",
14
+ "bul": "Bulgarian",
15
+ "cat": "Catalan",
16
+ "ceb": "Cebuano",
17
+ "ces": "Czech",
18
+ "ckb": "Central Kurdish",
19
+ "cmn": "Mandarin Chinese",
20
+ "cym": "Welsh",
21
+ "dan": "Danish",
22
+ "deu": "German",
23
+ "ell": "Greek",
24
+ "eng": "English",
25
+ "est": "Estonian",
26
+ "eus": "Basque",
27
+ "fin": "Finnish",
28
+ "fra": "French",
29
+ "gaz": "West Central Oromo",
30
+ "gle": "Irish",
31
+ "glg": "Galician",
32
+ "guj": "Gujarati",
33
+ "heb": "Hebrew",
34
+ "hin": "Hindi",
35
+ "hrv": "Croatian",
36
+ "hun": "Hungarian",
37
+ "hye": "Armenian",
38
+ "ibo": "Igbo",
39
+ "ind": "Indonesian",
40
+ "isl": "Icelandic",
41
+ "ita": "Italian",
42
+ "jav": "Javanese",
43
+ "jpn": "Japanese",
44
+ "kam": "Kamba",
45
+ "kan": "Kannada",
46
+ "kat": "Georgian",
47
+ "kaz": "Kazakh",
48
+ "kea": "Kabuverdianu",
49
+ "khk": "Halh Mongolian",
50
+ "khm": "Khmer",
51
+ "kir": "Kyrgyz",
52
+ "kor": "Korean",
53
+ "lao": "Lao",
54
+ "lit": "Lithuanian",
55
+ "ltz": "Luxembourgish",
56
+ "lug": "Ganda",
57
+ "luo": "Luo",
58
+ "lvs": "Standard Latvian",
59
+ "mai": "Maithili",
60
+ "mal": "Malayalam",
61
+ "mar": "Marathi",
62
+ "mkd": "Macedonian",
63
+ "mlt": "Maltese",
64
+ "mni": "Meitei",
65
+ "mya": "Burmese",
66
+ "nld": "Dutch",
67
+ "nno": "Norwegian Nynorsk",
68
+ "nob": "Norwegian Bokm\u00e5l",
69
+ "npi": "Nepali",
70
+ "nya": "Nyanja",
71
+ "oci": "Occitan",
72
+ "ory": "Odia",
73
+ "pan": "Punjabi",
74
+ "pbt": "Southern Pashto",
75
+ "pes": "Western Persian",
76
+ "pol": "Polish",
77
+ "por": "Portuguese",
78
+ "ron": "Romanian",
79
+ "rus": "Russian",
80
+ "slk": "Slovak",
81
+ "slv": "Slovenian",
82
+ "sna": "Shona",
83
+ "snd": "Sindhi",
84
+ "som": "Somali",
85
+ "spa": "Spanish",
86
+ "srp": "Serbian",
87
+ "swe": "Swedish",
88
+ "swh": "Swahili",
89
+ "tam": "Tamil",
90
+ "tel": "Telugu",
91
+ "tgk": "Tajik",
92
+ "tgl": "Tagalog",
93
+ "tha": "Thai",
94
+ "tur": "Turkish",
95
+ "ukr": "Ukrainian",
96
+ "urd": "Urdu",
97
+ "uzn": "Northern Uzbek",
98
+ "vie": "Vietnamese",
99
+ "xho": "Xhosa",
100
+ "yor": "Yoruba",
101
+ "yue": "Cantonese",
102
+ "zlm": "Colloquial Malay",
103
+ "zsm": "Standard Malay",
104
+ "zul": "Zulu",
105
+ }
106
+ LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()}
107
+
108
+ # Source langs: S2ST / S2TT / ASR don't need source lang
109
+ # T2TT / T2ST use this
110
+ text_source_language_codes = [
111
+ "afr",
112
+ "amh",
113
+ "arb",
114
+ "ary",
115
+ "arz",
116
+ "asm",
117
+ "azj",
118
+ "bel",
119
+ "ben",
120
+ "bos",
121
+ "bul",
122
+ "cat",
123
+ "ceb",
124
+ "ces",
125
+ "ckb",
126
+ "cmn",
127
+ "cym",
128
+ "dan",
129
+ "deu",
130
+ "ell",
131
+ "eng",
132
+ "est",
133
+ "eus",
134
+ "fin",
135
+ "fra",
136
+ "gaz",
137
+ "gle",
138
+ "glg",
139
+ "guj",
140
+ "heb",
141
+ "hin",
142
+ "hrv",
143
+ "hun",
144
+ "hye",
145
+ "ibo",
146
+ "ind",
147
+ "isl",
148
+ "ita",
149
+ "jav",
150
+ "jpn",
151
+ "kan",
152
+ "kat",
153
+ "kaz",
154
+ "khk",
155
+ "khm",
156
+ "kir",
157
+ "kor",
158
+ "lao",
159
+ "lit",
160
+ "lug",
161
+ "luo",
162
+ "lvs",
163
+ "mai",
164
+ "mal",
165
+ "mar",
166
+ "mkd",
167
+ "mlt",
168
+ "mni",
169
+ "mya",
170
+ "nld",
171
+ "nno",
172
+ "nob",
173
+ "npi",
174
+ "nya",
175
+ "ory",
176
+ "pan",
177
+ "pbt",
178
+ "pes",
179
+ "pol",
180
+ "por",
181
+ "ron",
182
+ "rus",
183
+ "slk",
184
+ "slv",
185
+ "sna",
186
+ "snd",
187
+ "som",
188
+ "spa",
189
+ "srp",
190
+ "swe",
191
+ "swh",
192
+ "tam",
193
+ "tel",
194
+ "tgk",
195
+ "tgl",
196
+ "tha",
197
+ "tur",
198
+ "ukr",
199
+ "urd",
200
+ "uzn",
201
+ "vie",
202
+ "yor",
203
+ "yue",
204
+ "zsm",
205
+ "zul",
206
+ ]
207
+ TEXT_SOURCE_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in text_source_language_codes])
208
+
209
+ # Target langs:
210
+ # S2ST / T2ST
211
+ s2st_target_language_codes = [
212
+ "eng",
213
+ "arb",
214
+ "ben",
215
+ "cat",
216
+ "ces",
217
+ "cmn",
218
+ "cym",
219
+ "dan",
220
+ "deu",
221
+ "est",
222
+ "fin",
223
+ "fra",
224
+ "hin",
225
+ "ind",
226
+ "ita",
227
+ "jpn",
228
+ "kor",
229
+ "mlt",
230
+ "nld",
231
+ "pes",
232
+ "pol",
233
+ "por",
234
+ "ron",
235
+ "rus",
236
+ "slk",
237
+ "spa",
238
+ "swe",
239
+ "swh",
240
+ "tel",
241
+ "tgl",
242
+ "tha",
243
+ "tur",
244
+ "ukr",
245
+ "urd",
246
+ "uzn",
247
+ "vie",
248
+ ]
249
+ S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2st_target_language_codes])
250
+
251
+ # S2TT / ASR
252
+ S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
253
+ # T2TT
254
+ T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
255
+
256
+
257
+ LANG_TO_SPKR_ID = {
258
+ "arb": [
259
+ 0
260
+ ],
261
+ "ben": [
262
+ 2,
263
+ 1
264
+ ],
265
+ "cat": [
266
+ 3
267
+ ],
268
+ "ces": [
269
+ 4
270
+ ],
271
+ "cmn": [
272
+ 5
273
+ ],
274
+ "cym": [
275
+ 6
276
+ ],
277
+ "dan": [
278
+ 7,
279
+ 8
280
+ ],
281
+ "deu": [
282
+ 9
283
+ ],
284
+ "eng": [
285
+ 10
286
+ ],
287
+ "est": [
288
+ 11,
289
+ 12,
290
+ 13
291
+ ],
292
+ "fin": [
293
+ 14
294
+ ],
295
+ "fra": [
296
+ 15
297
+ ],
298
+ "hin": [
299
+ 16
300
+ ],
301
+ "ind": [
302
+ 17,
303
+ 24,
304
+ 18,
305
+ 20,
306
+ 19,
307
+ 21,
308
+ 23,
309
+ 27,
310
+ 26,
311
+ 22,
312
+ 25
313
+ ],
314
+ "ita": [
315
+ 29,
316
+ 28
317
+ ],
318
+ "jpn": [
319
+ 30
320
+ ],
321
+ "kor": [
322
+ 31
323
+ ],
324
+ "mlt": [
325
+ 32,
326
+ 33,
327
+ 34
328
+ ],
329
+ "nld": [
330
+ 35
331
+ ],
332
+ "pes": [
333
+ 36
334
+ ],
335
+ "pol": [
336
+ 37
337
+ ],
338
+ "por": [
339
+ 38
340
+ ],
341
+ "ron": [
342
+ 39
343
+ ],
344
+ "rus": [
345
+ 40
346
+ ],
347
+ "slk": [
348
+ 41
349
+ ],
350
+ "spa": [
351
+ 42
352
+ ],
353
+ "swe": [
354
+ 43,
355
+ 45,
356
+ 44
357
+ ],
358
+ "swh": [
359
+ 46,
360
+ 48,
361
+ 47
362
+ ],
363
+ "tel": [
364
+ 49
365
+ ],
366
+ "tgl": [
367
+ 50
368
+ ],
369
+ "tha": [
370
+ 51,
371
+ 54,
372
+ 55,
373
+ 52,
374
+ 53
375
+ ],
376
+ "tur": [
377
+ 58,
378
+ 57,
379
+ 56
380
+ ],
381
+ "ukr": [
382
+ 59
383
+ ],
384
+ "urd": [
385
+ 60,
386
+ 61,
387
+ 62
388
+ ],
389
+ "uzn": [
390
+ 63,
391
+ 64,
392
+ 65
393
+ ],
394
+ "vie": [
395
+ 66,
396
+ 67,
397
+ 70,
398
+ 71,
399
+ 68,
400
+ 69
401
+ ]
402
+ }