TheComputerMan committed on
Commit
385b498
1 Parent(s): 525a4f7

Upload ArticulatoryCombinedTextFrontend.py

Browse files
Files changed (1) hide show
  1. ArticulatoryCombinedTextFrontend.py +323 -0
ArticulatoryCombinedTextFrontend.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import sys
3
+
4
+ import panphon
5
+ import phonemizer
6
+ import torch
7
+
8
+ from Preprocessing.papercup_features import generate_feature_table
9
+
10
+
11
class ArticulatoryCombinedTextFrontend:
    """
    Text frontend that converts raw text into a phoneme string and then into a
    tensor of per-phoneme feature vectors.

    Each phoneme's vector is the concatenation of the entry returned by
    generate_feature_table() for that phoneme and the first numeric panphon
    feature vector for it (or 24 zeros if panphon returns nothing).
    """

    # language code -> (language identifier handed to the espeak backend of
    # phonemizer, message printed on creation when silent is False)
    # remember to also update get_language_id() when adding something here
    _SUPPORTED_LANGUAGES = {
        "en": ("en-us", "Created an English Text-Frontend"),
        "de": ("de", "Created a German Text-Frontend"),
        "el": ("el", "Created a Greek Text-Frontend"),
        "es": ("es", "Created a Spanish Text-Frontend"),
        "fi": ("fi", "Created a Finnish Text-Frontend"),
        "ru": ("ru", "Created a Russian Text-Frontend"),
        "hu": ("hu", "Created a Hungarian Text-Frontend"),
        "nl": ("nl", "Created a Dutch Text-Frontend"),
        "fr": ("fr-fr", "Created a French Text-Frontend"),
        "it": ("it", "Created an Italian Text-Frontend"),
        "pt": ("pt", "Created a Portuguese Text-Frontend"),
        "pl": ("pl", "Created a Polish Text-Frontend"),
    }

    def __init__(self,
                 language,
                 use_word_boundaries=False,  # goes together well with
                 # parallel models and an aligner. Doesn't go together
                 # well with autoregressive models.
                 use_explicit_eos=True,
                 use_prosody=False,  # unfortunately the non-segmental
                 # nature of prosodic markers mixed with the sequential
                 # phonemes hurts the performance of end-to-end models a
                 # lot, even though one might think enriching the input
                 # with such information would help.
                 use_lexical_stress=False,
                 silent=True,
                 allow_unknown=False,
                 add_silence_to_end=True,
                 strip_silence=True):
        """
        Mostly preparing ID lookups.

        Args:
            language: two-letter language code; one of the keys of
                _SUPPORTED_LANGUAGES. Any other value prints a message and
                terminates the interpreter via sys.exit().
            use_word_boundaries: if False, spaces are removed from the phoneme
                string; if True, runs of whitespace are turned into the pause
                symbol "~".
            use_explicit_eos: stored on the instance.
                NOTE(review): not consulted anywhere in this class;
                string_to_tensor() always requests the end-of-sequence symbol.
            use_prosody: if False, the markers "ˌ", "ː", "ˑ", "˘", "|" and "‖"
                are removed from the phoneme string.
            use_lexical_stress: forwarded to phonemizer as with_stress.
            silent: if False, a short confirmation message is printed.
            allow_unknown: stored on the instance.
                NOTE(review): not consulted anywhere in this class.
            add_silence_to_end: if True, a "~" is appended to the phoneme string.
            strip_silence: if True, leading and trailing "~" are stripped from
                the phonemizer output before the other markers are added.
        """
        self.strip_silence = strip_silence
        self.use_word_boundaries = use_word_boundaries
        self.allow_unknown = allow_unknown
        self.use_explicit_eos = use_explicit_eos
        self.use_prosody = use_prosody
        self.use_stress = use_lexical_stress
        self.add_silence_to_end = add_silence_to_end
        self.feature_table = panphon.FeatureTable()

        # the isinstance guard keeps unhashable arguments on the "not supported"
        # path instead of raising a TypeError from the dict lookup
        language_config = self._SUPPORTED_LANGUAGES.get(language) if isinstance(language, str) else None
        if language_config is None:
            print("Language not supported yet")
            sys.exit()

        self.g2p_lang, creation_message = language_config
        if language == "en":
            # only English has a dedicated abbreviation expansion
            self.expand_abbreviations = english_text_expansion
        else:
            self.expand_abbreviations = lambda x: x
        if not silent:
            print(creation_message)

        self.phone_to_vector_papercup = generate_feature_table()

        self.phone_to_vector = dict()
        for phone in self.phone_to_vector_papercup:
            panphon_features = self.feature_table.word_to_vector_list(phone, numeric=True)
            if panphon_features == []:
                # no panphon entry for this symbol: pad with a zero vector
                panphon_features = [[0] * 24]
            papercup_features = self.phone_to_vector_papercup[phone]
            self.phone_to_vector[phone] = papercup_features + panphon_features[0]

        self.phone_to_id = {  # this lookup must be updated manually, because the only
            # other way would be extracting them from a set, which can be non-deterministic
            '~': 0,
            '#': 1,
            '?': 2,
            '!': 3,
            '.': 4,
            'ɜ': 5,
            'ɫ': 6,
            'ə': 7,
            'ɚ': 8,
            'a': 9,
            'ð': 10,
            'ɛ': 11,
            'ɪ': 12,
            'ᵻ': 13,
            'ŋ': 14,
            'ɔ': 15,
            'ɒ': 16,
            'ɾ': 17,
            'ʃ': 18,
            'θ': 19,
            'ʊ': 20,
            'ʌ': 21,
            'ʒ': 22,
            'æ': 23,
            'b': 24,
            'ʔ': 25,
            'd': 26,
            'e': 27,
            'f': 28,
            'g': 29,  # plain ASCII g, distinct from the IPA script g below
            'h': 30,
            'i': 31,
            'j': 32,
            'k': 33,
            'l': 34,
            'm': 35,
            'n': 36,
            'ɳ': 37,
            'o': 38,
            'p': 39,
            'ɡ': 40,  # IPA script g, distinct from the ASCII g above
            'ɹ': 41,
            'r': 42,
            's': 43,
            't': 44,
            'u': 45,
            'v': 46,
            'w': 47,
            'x': 48,
            'z': 49,
            'ʀ': 50,
            'ø': 51,
            'ç': 52,
            'ɐ': 53,
            'œ': 54,
            'y': 55,
            'ʏ': 56,
            'ɑ': 57,
            'c': 58,
            'ɲ': 59,
            'ɣ': 60,
            'ʎ': 61,
            'β': 62,
            'ʝ': 63,
            'ɟ': 64,
            'q': 65,
            'ɕ': 66,
            'ʲ': 67,
            'ɭ': 68,
            'ɵ': 69,
            'ʑ': 70,
            'ʋ': 71,
            'ʁ': 72,
            'ɨ': 73,
            'ʂ': 74,
            'ɬ': 75,
        }  # for the states of the ctc loss and dijkstra/mas in the aligner

        self.id_to_phone = {v: k for k, v in self.phone_to_id.items()}

    def string_to_tensor(self, text, view=False, device="cpu", handle_missing=True, input_phonemes=False):
        """
        Fixes unicode errors, expands some abbreviations,
        turns graphemes into phonemes and then vectorizes
        the sequence as articulatory features.

        Args:
            text: the input text, or an already phonemized string if
                input_phonemes is True.
            view: if True, the phoneme string is printed.
            device: device on which the resulting tensor is created.
            handle_missing: if True, symbols without an entry in
                phone_to_vector are reported via print and skipped; if False,
                the KeyError propagates to the caller.
            input_phonemes: if True, text is used as the phoneme string as is.

        Returns:
            A float32 tensor with one row per known phoneme symbol.
        """
        if input_phonemes:
            phones = text
        else:
            phones = self.get_phone_string(text=text, include_eos_symbol=True)
        if view:
            print("Phonemes: \n{}\n".format(phones))
        phones_vector = list()
        # turn into numeric vectors
        for char in phones:
            if handle_missing:
                try:
                    phones_vector.append(self.phone_to_vector[char])
                except KeyError:
                    print("unknown phoneme: {}".format(char))
            else:
                phones_vector.append(self.phone_to_vector[char])  # leave error handling to elsewhere

        # torch.tensor (unlike the legacy torch.Tensor constructor) accepts
        # non-CPU devices; float32 matches what torch.Tensor produced.
        return torch.tensor(phones_vector, dtype=torch.float32, device=device)

    def get_phone_string(self, text, include_eos_symbol=True):
        """
        Turn text into a cleaned phoneme string.

        The text is run through self.expand_abbreviations and phonemized with
        the espeak backend of phonemizer. Punctuation is then normalized so
        that "~" marks pauses while ".", "?" and "!" are kept. The result always
        starts with a single "~"; depending on the configuration it ends with
        "~" and/or the end-of-sequence symbol "#".

        Args:
            text: the text to phonemize.
            include_eos_symbol: if True, "#" is appended to the result.

        Returns:
            The phoneme string.
        """
        # expand abbreviations
        utt = self.expand_abbreviations(text)
        # phonemize
        phones = phonemizer.phonemize(utt,
                                      language_switch='remove-flags',
                                      backend="espeak",
                                      language=self.g2p_lang,
                                      preserve_punctuation=True,
                                      strip=True,
                                      punctuation_marks=';:,.!?¡¿—…"«»“”~/',
                                      with_stress=self.use_stress)
        # map the various separators onto "," first, then turn every "," into the pause symbol "~"
        phones = phones.replace(";", ",").replace("/", " ").replace("—", "") \
            .replace(":", ",").replace('"', ",").replace("-", ",").replace("...", ",").replace("\n", " ") \
            .replace("\t", " ").replace("¡", "").replace("¿", "").replace(",", "~")
        # less than 1 wide characters hidden here
        phones = phones.replace(" ̃", "").replace('̩', "").replace("̃", "").replace("̪", "")
        phones = re.sub("~+", "~", phones)
        if not self.use_prosody:
            # retain ~ as heuristic pause marker, even though all other symbols are removed with this option.
            # also retain . ? and ! since they can be indicators for the stop token
            phones = phones.replace("ˌ", "").replace("ː", "").replace("ˑ", "") \
                .replace("˘", "").replace("|", "").replace("‖", "")
        if not self.use_word_boundaries:
            phones = phones.replace(" ", "")
        else:
            phones = re.sub(r"\s+", " ", phones)
            phones = re.sub(" ", "~", phones)
        if self.strip_silence:
            phones = phones.strip("~")
        if self.add_silence_to_end:
            phones += "~"  # adding a silence in the end during add_silence_to_end produces more natural sounding prosody
        if include_eos_symbol:
            phones += "#"

        # always start with exactly one pause symbol and never emit runs of them
        phones = "~" + phones
        phones = re.sub("~+", "~", phones)

        return phones
273
+
274
+
275
# Compiled once at import time. Each abbreviation already carries its trailing
# dot, so it is escaped as a whole: the dot must only match a literal period.
_ABBREVIATIONS = [(re.compile(r"\b" + re.escape(abbreviation), re.IGNORECASE), expansion)
                  for abbreviation, expansion in
                  [('Mrs.', 'misess'), ('Mr.', 'mister'), ('Dr.', 'doctor'), ('St.', 'saint'), ('Co.', 'company'), ('Jr.', 'junior'), ('Maj.', 'major'),
                   ('Gen.', 'general'), ('Drs.', 'doctors'), ('Rev.', 'reverend'), ('Lt.', 'lieutenant'), ('Hon.', 'honorable'), ('Sgt.', 'sergeant'),
                   ('Capt.', 'captain'), ('Esq.', 'esquire'), ('Ltd.', 'limited'), ('Col.', 'colonel'), ('Ft.', 'fort')]]


def english_text_expansion(text):
    """
    Apply a small part of the tacotron style text cleaning pipeline, suitable for e.g. LJSpeech.
    See https://github.com/keithito/tacotron/
    Careful: Only apply to english datasets. Different languages need different cleaners.

    Every listed abbreviation (matched case-insensitively at a word boundary,
    including its trailing period) is replaced by its spelled-out form.

    Args:
        text: the text in which abbreviations should be expanded.

    Returns:
        The text with all listed abbreviations expanded.
    """
    for regex, replacement in _ABBREVIATIONS:
        text = regex.sub(replacement, text)
    return text
288
+
289
+
290
def get_language_id(language):
    """
    Look up the numeric ID of a language code.

    Args:
        language: two-letter language code such as "en" or "de".

    Returns:
        A torch.LongTensor holding the single ID of the language, or None if
        the code is not one of the listed languages.
    """
    known_languages = (
        ("en", 12),
        ("de", 1),
        ("el", 2),
        ("es", 3),
        ("fi", 4),
        ("ru", 5),
        ("hu", 6),
        ("nl", 7),
        ("fr", 8),
        ("pt", 9),
        ("pl", 10),
        ("it", 11),
    )
    for code, index in known_languages:
        if language == code:
            return torch.LongTensor([index])
315
+
316
+
317
if __name__ == '__main__':
    # smoke test: build a frontend per language and print the phonemes and
    # the resulting feature tensor for one sample utterance each
    sample_utterances = (
        # test an English utterance
        ("en", "This is a complex sentence, it even has a pause! But can it do this? Nice."),
        # test a German utterance
        ("de", "Alles klar, jetzt testen wir einen deutschen Satz. Ich hoffe es gibt nicht mehr viele unspezifizierte Phoneme."),
    )
    for sample_language, sample_text in sample_utterances:
        frontend = ArticulatoryCombinedTextFrontend(language=sample_language)
        print(frontend.string_to_tensor(sample_text, view=True))