Flux9665 committed on
Commit
c07eff7
1 Parent(s): ba705d6

Update Preprocessing/TextFrontend.py

Browse files
Files changed (1) hide show
  1. Preprocessing/TextFrontend.py +0 -515
Preprocessing/TextFrontend.py CHANGED
@@ -74,510 +74,6 @@ class ArticulatoryCombinedTextFrontend:
74
  self.expand_abbreviations = english_text_expansion
75
  self.phonemizer = "espeak"
76
 
77
- elif language == "deu":
78
- self.g2p_lang = "de" # German
79
- self.expand_abbreviations = lambda x: x
80
- self.phonemizer = "espeak"
81
-
82
- elif language == "ell":
83
- self.g2p_lang = "el" # Greek
84
- self.expand_abbreviations = lambda x: x
85
- self.phonemizer = "espeak"
86
-
87
- elif language == "spa":
88
- self.g2p_lang = "es" # Spanish
89
- self.expand_abbreviations = lambda x: x
90
- self.phonemizer = "espeak"
91
-
92
- elif language == "spa-lat":
93
- self.g2p_lang = "es-419" # Spanish
94
- self.expand_abbreviations = lambda x: x
95
- self.phonemizer = "espeak"
96
-
97
- elif language == "fin":
98
- self.g2p_lang = "fi" # Finnish
99
- self.expand_abbreviations = lambda x: x
100
- self.phonemizer = "espeak"
101
-
102
- elif language == "rus":
103
- self.g2p_lang = "ru" # Russian
104
- self.expand_abbreviations = lambda x: x
105
- self.phonemizer = "espeak"
106
-
107
- elif language == "hun":
108
- self.g2p_lang = "hu" # Hungarian
109
- self.expand_abbreviations = lambda x: x
110
- self.phonemizer = "espeak"
111
-
112
- elif language == "nld":
113
- self.g2p_lang = "nl" # Dutch
114
- self.expand_abbreviations = lambda x: x
115
- self.phonemizer = "espeak"
116
-
117
- elif language == "fra":
118
- self.g2p_lang = "fr-fr" # French
119
- self.expand_abbreviations = remove_french_spacing
120
- self.phonemizer = "espeak"
121
-
122
- elif language == "fr-be":
123
- self.g2p_lang = "fr-be" # French
124
- self.expand_abbreviations = remove_french_spacing
125
- self.phonemizer = "espeak"
126
-
127
- elif language == "fr-sw":
128
- self.g2p_lang = "fr-ch" # French
129
- self.expand_abbreviations = remove_french_spacing
130
- self.phonemizer = "espeak"
131
-
132
- elif language == "ita":
133
- self.g2p_lang = "it" # Italian
134
- self.expand_abbreviations = lambda x: x
135
- self.phonemizer = "espeak"
136
-
137
- elif language == "por":
138
- self.g2p_lang = "pt" # Portuguese
139
- self.expand_abbreviations = lambda x: x
140
- self.phonemizer = "espeak"
141
-
142
- elif language == "pt-br":
143
- self.g2p_lang = "pt-br" # Portuguese
144
- self.expand_abbreviations = lambda x: x
145
- self.phonemizer = "espeak"
146
-
147
- elif language == "pol":
148
- self.g2p_lang = "pl" # Polish
149
- self.expand_abbreviations = lambda x: x
150
- self.phonemizer = "espeak"
151
-
152
- elif language == "cmn":
153
- self.g2p_lang = "cmn" # Mandarin
154
- self.expand_abbreviations = convert_kanji_to_pinyin_mandarin
155
- self.phonemizer = "dragonmapper"
156
-
157
- elif language == "vie":
158
- self.g2p_lang = "vi" # Northern Vietnamese
159
- self.expand_abbreviations = lambda x: x
160
- self.phonemizer = "espeak"
161
-
162
- elif language == "vi-ctr":
163
- self.g2p_lang = "vi-vn-x-central" # Central Vietnamese
164
- self.expand_abbreviations = lambda x: x
165
- self.phonemizer = "espeak"
166
-
167
- elif language == "vi-so":
168
- self.g2p_lang = "vi-vn-x-south" # Southern Vietnamese
169
- self.expand_abbreviations = lambda x: x
170
- self.phonemizer = "espeak"
171
-
172
- elif language == "ukr":
173
- self.g2p_lang = "uk" # Ukrainian
174
- self.expand_abbreviations = lambda x: x
175
- self.phonemizer = "espeak"
176
-
177
- elif language == "pes":
178
- self.g2p_lang = "fa" # Western Farsi
179
- self.expand_abbreviations = lambda x: x
180
- self.phonemizer = "espeak"
181
-
182
- elif language == "afr":
183
- self.g2p_lang = "af" # Afrikaans
184
- self.expand_abbreviations = lambda x: x
185
- self.phonemizer = "espeak"
186
-
187
- elif language == "aln":
188
- self.g2p_lang = "sq" # Albanian
189
- self.expand_abbreviations = lambda x: x
190
- self.phonemizer = "espeak"
191
-
192
- elif language == "amh":
193
- self.g2p_lang = "am" # Amharic
194
- self.expand_abbreviations = lambda x: x
195
- self.phonemizer = "espeak"
196
-
197
- elif language == "arb":
198
- self.g2p_lang = "ar" # Arabic
199
- self.expand_abbreviations = lambda x: x
200
- self.phonemizer = "espeak"
201
-
202
- elif language == "arg":
203
- self.g2p_lang = "an" # Aragonese
204
- self.expand_abbreviations = lambda x: x
205
- self.phonemizer = "espeak"
206
-
207
- elif language == "hye":
208
- self.g2p_lang = "hy" # East Armenian
209
- self.expand_abbreviations = lambda x: x
210
- self.phonemizer = "espeak"
211
-
212
- elif language == "hyw":
213
- self.g2p_lang = "hyw" # West Armenian
214
- self.expand_abbreviations = lambda x: x
215
- self.phonemizer = "espeak"
216
-
217
- elif language == "azj":
218
- self.g2p_lang = "az" # Azerbaijani
219
- self.expand_abbreviations = lambda x: x
220
- self.phonemizer = "espeak"
221
-
222
- elif language == "bak":
223
- self.g2p_lang = "ba" # Bashkir
224
- self.expand_abbreviations = lambda x: x
225
- self.phonemizer = "espeak"
226
-
227
- elif language == "eus":
228
- self.g2p_lang = "eu" # Basque
229
- self.expand_abbreviations = lambda x: x
230
- self.phonemizer = "espeak"
231
-
232
- elif language == "bel":
233
- self.g2p_lang = "be" # Belarusian
234
- self.expand_abbreviations = lambda x: x
235
- self.phonemizer = "espeak"
236
-
237
- elif language == "ben":
238
- self.g2p_lang = "bn" # Bengali
239
- self.expand_abbreviations = lambda x: x
240
- self.phonemizer = "espeak"
241
-
242
- elif language == "bpy":
243
- self.g2p_lang = "bpy" # Bishnupriya Manipuri
244
- self.expand_abbreviations = lambda x: x
245
- self.phonemizer = "espeak"
246
-
247
- elif language == "bos":
248
- self.g2p_lang = "bs" # Bosnian
249
- self.expand_abbreviations = lambda x: x
250
- self.phonemizer = "espeak"
251
-
252
- elif language == "bul":
253
- self.g2p_lang = "bg" # Bulgarian
254
- self.expand_abbreviations = lambda x: x
255
- self.phonemizer = "espeak"
256
-
257
- elif language == "mya":
258
- self.g2p_lang = "my" # Burmese
259
- self.expand_abbreviations = lambda x: x
260
- self.phonemizer = "espeak"
261
-
262
- elif language == "chr":
263
- self.g2p_lang = "chr" # Cherokee
264
- self.expand_abbreviations = lambda x: x
265
- self.phonemizer = "espeak"
266
-
267
- elif language == "yue":
268
- self.g2p_lang = "yue" # Chinese Cantonese
269
- self.expand_abbreviations = lambda x: x
270
- self.phonemizer = "espeak"
271
-
272
- elif language == "hak":
273
- self.g2p_lang = "hak" # Chinese Hakka
274
- self.expand_abbreviations = lambda x: x
275
- self.phonemizer = "espeak"
276
-
277
- elif language == "haw":
278
- self.g2p_lang = "haw" # Hawaiian
279
- self.expand_abbreviations = lambda x: x
280
- self.phonemizer = "espeak"
281
-
282
- elif language == "hrv":
283
- self.g2p_lang = "hr" # Croatian
284
- self.expand_abbreviations = lambda x: x
285
- self.phonemizer = "espeak"
286
-
287
- elif language == "ces":
288
- self.g2p_lang = "cs" # Czech
289
- self.expand_abbreviations = lambda x: x
290
- self.phonemizer = "espeak"
291
-
292
- elif language == "dan":
293
- self.g2p_lang = "da" # Danish
294
- self.expand_abbreviations = lambda x: x
295
- self.phonemizer = "espeak"
296
-
297
- elif language == "ekk":
298
- self.g2p_lang = "et" # Estonian
299
- self.expand_abbreviations = lambda x: x
300
- self.phonemizer = "espeak"
301
-
302
- elif language == "gle":
303
- self.g2p_lang = "ga" # Gaelic Irish
304
- self.expand_abbreviations = lambda x: x
305
- self.phonemizer = "espeak"
306
-
307
- elif language == "gla":
308
- self.g2p_lang = "gd" # Gaelic Scottish
309
- self.expand_abbreviations = lambda x: x
310
- self.phonemizer = "espeak"
311
-
312
- elif language == "en-sc":
313
- self.g2p_lang = "en-gb-scotland"
314
- self.expand_abbreviations = lambda x: x
315
- self.phonemizer = "espeak"
316
-
317
- elif language == "kat":
318
- self.g2p_lang = "ka" # Georgian
319
- self.expand_abbreviations = lambda x: x
320
- self.phonemizer = "espeak"
321
-
322
- elif language == "kal":
323
- self.g2p_lang = "kl" # Greenlandic
324
- self.expand_abbreviations = lambda x: x
325
- self.phonemizer = "espeak"
326
-
327
- elif language == "guj":
328
- self.g2p_lang = "gu" # Gujarati
329
- self.expand_abbreviations = lambda x: x
330
- self.phonemizer = "espeak"
331
-
332
- elif language == "heb":
333
- self.g2p_lang = "he" # Hebrew
334
- self.expand_abbreviations = lambda x: x
335
- self.phonemizer = "espeak"
336
-
337
- elif language == "hin":
338
- self.g2p_lang = "hi" # Hindi
339
- self.expand_abbreviations = lambda x: x
340
- self.phonemizer = "espeak"
341
-
342
- elif language == "isl":
343
- self.g2p_lang = "is" # Icelandic
344
- self.expand_abbreviations = lambda x: x
345
- self.phonemizer = "espeak"
346
-
347
- elif language == "ind":
348
- self.g2p_lang = "id" # Indonesian
349
- self.expand_abbreviations = lambda x: x
350
- self.phonemizer = "espeak"
351
-
352
- elif language == "jpn":
353
- import pykakasi
354
-
355
- self.kakasi = pykakasi.Kakasi() # this is not a satisfactory solution, but it is the best one I could come up with so far.
356
- self.expand_abbreviations = lambda x: " ".join([chunk["hepburn"] for chunk in self.kakasi.convert(x)])
357
- self.g2p_lang = language
358
- self.phonemizer = "transphone"
359
- self.transphone = read_g2p(device=device)
360
-
361
- elif language == "kan":
362
- self.g2p_lang = "kn" # Kannada
363
- self.expand_abbreviations = lambda x: x
364
- self.phonemizer = "espeak"
365
-
366
- elif language == "knn":
367
- self.g2p_lang = "kok" # Konkani
368
- self.expand_abbreviations = lambda x: x
369
- self.phonemizer = "espeak"
370
-
371
- elif language == "kor":
372
- self.g2p_lang = "ko" # Korean
373
- self.expand_abbreviations = lambda x: x
374
- self.phonemizer = "espeak"
375
-
376
- elif language == "ckb":
377
- self.g2p_lang = "ku" # Kurdish
378
- self.expand_abbreviations = lambda x: x
379
- self.phonemizer = "espeak"
380
-
381
- elif language == "kaz":
382
- self.g2p_lang = "kk" # Kazakh
383
- self.expand_abbreviations = lambda x: x
384
- self.phonemizer = "espeak"
385
-
386
- elif language == "kir":
387
- self.g2p_lang = "ky" # Kyrgyz
388
- self.expand_abbreviations = lambda x: x
389
- self.phonemizer = "espeak"
390
-
391
- elif language == "lat":
392
- self.g2p_lang = "la" # Latin
393
- self.expand_abbreviations = lambda x: x
394
- self.phonemizer = "espeak"
395
-
396
- elif language == "ltz":
397
- self.g2p_lang = "lb" # Luxembourgish
398
- self.expand_abbreviations = lambda x: x
399
- self.phonemizer = "espeak"
400
-
401
- elif language == "lvs":
402
- self.g2p_lang = "lv" # Latvian
403
- self.expand_abbreviations = lambda x: x
404
- self.phonemizer = "espeak"
405
-
406
- elif language == "lit":
407
- self.g2p_lang = "lt" # Lithuanian
408
- self.expand_abbreviations = lambda x: x
409
- self.phonemizer = "espeak"
410
-
411
- elif language == "mri":
412
- self.g2p_lang = "mi" # Māori
413
- self.expand_abbreviations = lambda x: x
414
- self.phonemizer = "espeak"
415
-
416
- elif language == "mkd":
417
- self.g2p_lang = "mk" # Macedonian
418
- self.expand_abbreviations = lambda x: x
419
- self.phonemizer = "espeak"
420
-
421
- elif language == "zlm":
422
- self.g2p_lang = "ms" # Malay
423
- self.expand_abbreviations = lambda x: x
424
- self.phonemizer = "espeak"
425
-
426
- elif language == "mal":
427
- self.g2p_lang = "ml" # Malayalam
428
- self.expand_abbreviations = lambda x: x
429
- self.phonemizer = "espeak"
430
-
431
- elif language == "mlt":
432
- self.g2p_lang = "mt" # Maltese
433
- self.expand_abbreviations = lambda x: x
434
- self.phonemizer = "espeak"
435
-
436
- elif language == "mar":
437
- self.g2p_lang = "mr" # Marathi
438
- self.expand_abbreviations = lambda x: x
439
- self.phonemizer = "espeak"
440
-
441
- elif language == "nci":
442
- self.g2p_lang = "nci" # Nahuatl
443
- self.expand_abbreviations = lambda x: x
444
- self.phonemizer = "espeak"
445
-
446
- elif language == "npi":
447
- self.g2p_lang = "ne" # Nepali
448
- self.expand_abbreviations = lambda x: x
449
- self.phonemizer = "espeak"
450
-
451
- elif language == "nob":
452
- self.g2p_lang = "nb" # Norwegian Bokmål
453
- self.expand_abbreviations = lambda x: x
454
- self.phonemizer = "espeak"
455
-
456
- elif language == "nog":
457
- self.g2p_lang = "nog" # Nogai
458
- self.expand_abbreviations = lambda x: x
459
- self.phonemizer = "espeak"
460
-
461
- elif language == "ory":
462
- self.g2p_lang = "or" # Oriya
463
- self.expand_abbreviations = lambda x: x
464
- self.phonemizer = "espeak"
465
-
466
- elif language == "gaz":
467
- self.g2p_lang = "om" # Oromo
468
- self.expand_abbreviations = lambda x: x
469
- self.phonemizer = "espeak"
470
-
471
- elif language == "pap":
472
- self.g2p_lang = "pap" # Papiamento
473
- self.expand_abbreviations = lambda x: x
474
- self.phonemizer = "espeak"
475
-
476
- elif language == "pan":
477
- self.g2p_lang = "pa" # Punjabi
478
- self.expand_abbreviations = lambda x: x
479
- self.phonemizer = "espeak"
480
-
481
- elif language == "ron":
482
- self.g2p_lang = "ro" # Romanian
483
- self.expand_abbreviations = lambda x: x
484
- self.phonemizer = "espeak"
485
-
486
- elif language == "lav":
487
- self.g2p_lang = "ru-lv" # Russian Latvia
488
- self.expand_abbreviations = lambda x: x
489
- self.phonemizer = "espeak"
490
-
491
- elif language == "srp":
492
- self.g2p_lang = "sr" # Serbian
493
- self.expand_abbreviations = lambda x: x
494
- self.phonemizer = "espeak"
495
-
496
- elif language == "tsn":
497
- self.g2p_lang = "tn" # Setswana
498
- self.expand_abbreviations = lambda x: x
499
- self.phonemizer = "espeak"
500
-
501
- elif language == "snd":
502
- self.g2p_lang = "sd" # Sindhi
503
- self.expand_abbreviations = lambda x: x
504
- self.phonemizer = "espeak"
505
-
506
- elif language == "slk":
507
- self.g2p_lang = "sk" # Slovak
508
- self.expand_abbreviations = lambda x: x
509
- self.phonemizer = "espeak"
510
-
511
- elif language == "slv":
512
- self.g2p_lang = "sl" # Slovenian
513
- self.expand_abbreviations = lambda x: x
514
- self.phonemizer = "espeak"
515
-
516
- elif language == "smj":
517
- self.g2p_lang = "smj" # Lule Saami
518
- self.expand_abbreviations = lambda x: x
519
- self.phonemizer = "espeak"
520
-
521
- elif language == "swh":
522
- self.g2p_lang = "sw" # Swahili
523
- self.expand_abbreviations = lambda x: x
524
- self.phonemizer = "espeak"
525
-
526
- elif language == "swe":
527
- self.g2p_lang = "sv" # Swedish
528
- self.expand_abbreviations = lambda x: x
529
- self.phonemizer = "espeak"
530
-
531
- elif language == "tam":
532
- self.g2p_lang = "ta" # Tamil
533
- self.expand_abbreviations = lambda x: x
534
- self.phonemizer = "espeak"
535
-
536
- elif language == "tha":
537
- self.g2p_lang = "th" # Thai
538
- self.expand_abbreviations = lambda x: x
539
- self.phonemizer = "espeak"
540
-
541
- elif language == "tuk":
542
- self.g2p_lang = "tk" # Turkmen
543
- self.expand_abbreviations = lambda x: x
544
- self.phonemizer = "espeak"
545
-
546
- elif language == "tat":
547
- self.g2p_lang = "tt" # Tatar
548
- self.expand_abbreviations = lambda x: x
549
- self.phonemizer = "espeak"
550
-
551
- elif language == "tel":
552
- self.g2p_lang = "te" # Telugu
553
- self.expand_abbreviations = lambda x: x
554
- self.phonemizer = "espeak"
555
-
556
- elif language == "tur":
557
- self.g2p_lang = "tr" # Turkish
558
- self.expand_abbreviations = lambda x: x
559
- self.phonemizer = "espeak"
560
-
561
- elif language == "uig":
562
- self.g2p_lang = "ug" # Uyghur
563
- self.expand_abbreviations = lambda x: x
564
- self.phonemizer = "espeak"
565
-
566
- elif language == "urd":
567
- self.g2p_lang = "ur" # Urdu
568
- self.expand_abbreviations = lambda x: x
569
- self.phonemizer = "espeak"
570
-
571
- elif language == "uzn":
572
- self.g2p_lang = "uz" # Uzbek
573
- self.expand_abbreviations = lambda x: x
574
- self.phonemizer = "espeak"
575
-
576
- elif language == "cym":
577
- self.g2p_lang = "cy" # Welsh
578
- self.expand_abbreviations = lambda x: x
579
- self.phonemizer = "espeak"
580
-
581
  else:
582
  # blanket solution for the rest
583
  print("Using Transphone. A specialized phonemizer might work better.")
@@ -1044,17 +540,6 @@ class ArticulatoryCombinedTextFrontend:
1044
 
1045
 
1046
  def english_text_expansion(text):
1047
- """
1048
- Apply as small part of the tacotron style text cleaning pipeline, suitable for e.g. LJSpeech.
1049
- See https://github.com/keithito/tacotron/
1050
- Careful: Only apply to english datasets. Different languages need different cleaners.
1051
- """
1052
- _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in
1053
- [('Mrs.', 'misess'), ('Mr.', 'mister'), ('Dr.', 'doctor'), ('St.', 'saint'), ('Co.', 'company'), ('Jr.', 'junior'), ('Maj.', 'major'),
1054
- ('Gen.', 'general'), ('Drs.', 'doctors'), ('Rev.', 'reverend'), ('Lt.', 'lieutenant'), ('Hon.', 'honorable'), ('Sgt.', 'sergeant'),
1055
- ('Capt.', 'captain'), ('Esq.', 'esquire'), ('Ltd.', 'limited'), ('Col.', 'colonel'), ('Ft.', 'fort')]]
1056
- for regex, replacement in _abbreviations:
1057
- text = re.sub(regex, replacement, text)
1058
  return text
1059
 
1060
 
 
74
  self.expand_abbreviations = english_text_expansion
75
  self.phonemizer = "espeak"
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  else:
78
  # blanket solution for the rest
79
  print("Using Transphone. A specialized phonemizer might work better.")
 
540
 
541
 
542
  def english_text_expansion(text):
 
 
 
 
 
 
 
 
 
 
 
543
  return text
544
 
545