# SpeechCloning/Preprocessing/ArticulatoryCombinedTextFrontend.py
# Author: Florian Lux
import re
import sys

import panphon
import phonemizer
import torch

from Preprocessing.papercup_features import generate_feature_table


class ArticulatoryCombinedTextFrontend:
def __init__(self,
language,
use_word_boundaries=False, # goes together well with
# parallel models and an aligner. Doesn't go together
# well with autoregressive models.
use_explicit_eos=True,
use_prosody=False, # unfortunately the non-segmental
# nature of prosodic markers mixed with the sequential
# phonemes hurts the performance of end-to-end models a
# lot, even though one might think enriching the input
# with such information would help.
use_lexical_stress=False,
silent=True,
allow_unknown=False,
add_silence_to_end=True,
strip_silence=True):
"""
Mostly preparing ID lookups
"""
self.strip_silence = strip_silence
self.use_word_boundaries = use_word_boundaries
self.allow_unknown = allow_unknown
self.use_explicit_eos = use_explicit_eos
self.use_prosody = use_prosody
self.use_stress = use_lexical_stress
self.add_silence_to_end = add_silence_to_end
        self.feature_table = panphon.FeatureTable()  # maps IPA symbols to vectors of articulatory features
if language == "en":
self.g2p_lang = "en-us"
self.expand_abbreviations = english_text_expansion
if not silent:
print("Created an English Text-Frontend")
elif language == "de":
self.g2p_lang = "de"
self.expand_abbreviations = lambda x: x
if not silent:
print("Created a German Text-Frontend")
elif language == "el":
self.g2p_lang = "el"
self.expand_abbreviations = lambda x: x
if not silent:
print("Created a Greek Text-Frontend")
elif language == "es":
self.g2p_lang = "es"
self.expand_abbreviations = lambda x: x
if not silent:
print("Created a Spanish Text-Frontend")
elif language == "fi":
self.g2p_lang = "fi"
self.expand_abbreviations = lambda x: x
if not silent:
print("Created a Finnish Text-Frontend")
elif language == "ru":
self.g2p_lang = "ru"
self.expand_abbreviations = lambda x: x
if not silent:
print("Created a Russian Text-Frontend")
elif language == "hu":
self.g2p_lang = "hu"
self.expand_abbreviations = lambda x: x
if not silent:
print("Created a Hungarian Text-Frontend")
elif language == "nl":
self.g2p_lang = "nl"
self.expand_abbreviations = lambda x: x
if not silent:
print("Created a Dutch Text-Frontend")
elif language == "fr":
self.g2p_lang = "fr-fr"
self.expand_abbreviations = lambda x: x
if not silent:
print("Created a French Text-Frontend")
elif language == "it":
self.g2p_lang = "it"
self.expand_abbreviations = lambda x: x
if not silent:
print("Created a Italian Text-Frontend")
elif language == "pt":
self.g2p_lang = "pt"
self.expand_abbreviations = lambda x: x
if not silent:
print("Created a Portuguese Text-Frontend")
elif language == "pl":
self.g2p_lang = "pl"
self.expand_abbreviations = lambda x: x
if not silent:
print("Created a Polish Text-Frontend")
# remember to also update get_language_id() when adding something here
else:
print("Language not supported yet")
sys.exit()
self.phone_to_vector_papercup = generate_feature_table()
self.phone_to_vector = dict()
for phone in self.phone_to_vector_papercup:
panphon_features = self.feature_table.word_to_vector_list(phone, numeric=True)
            if not panphon_features:
                # fall back to zeros for symbols that panphon does not know, such as the pause and punctuation markers
                panphon_features = [[0] * 24]
papercup_features = self.phone_to_vector_papercup[phone]
self.phone_to_vector[phone] = papercup_features + panphon_features[0]
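        # every phone now maps to one flat numeric vector: its papercup feature values
        # followed by 24 panphon feature values (zeros for symbols panphon does not know)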
self.phone_to_id = { # this lookup must be updated manually, because the only
# other way would be extracting them from a set, which can be non-deterministic
'~': 0,
'#': 1,
'?': 2,
'!': 3,
'.': 4,
'ɜ': 5,
'ɫ': 6,
'ə': 7,
'ɚ': 8,
'a': 9,
'ð': 10,
'ɛ': 11,
'ɪ': 12,
'ᵻ': 13,
'ŋ': 14,
'ɔ': 15,
'ɒ': 16,
'ɾ': 17,
'ʃ': 18,
'θ': 19,
'ʊ': 20,
'ʌ': 21,
'ʒ': 22,
'æ': 23,
'b': 24,
'ʔ': 25,
'd': 26,
'e': 27,
'f': 28,
'g': 29,
'h': 30,
'i': 31,
'j': 32,
'k': 33,
'l': 34,
'm': 35,
'n': 36,
'ɳ': 37,
'o': 38,
'p': 39,
'ɡ': 40,
'ɹ': 41,
'r': 42,
's': 43,
't': 44,
'u': 45,
'v': 46,
'w': 47,
'x': 48,
'z': 49,
'ʀ': 50,
'ø': 51,
'ç': 52,
'ɐ': 53,
'œ': 54,
'y': 55,
'ʏ': 56,
'ɑ': 57,
'c': 58,
'ɲ': 59,
'ɣ': 60,
'ʎ': 61,
'β': 62,
'ʝ': 63,
'ɟ': 64,
'q': 65,
'ɕ': 66,
'ʲ': 67,
'ɭ': 68,
'ɵ': 69,
'ʑ': 70,
'ʋ': 71,
'ʁ': 72,
'ɨ': 73,
'ʂ': 74,
'ɬ': 75,
} # for the states of the ctc loss and dijkstra/mas in the aligner
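        # inverse lookup, e.g. for turning aligner predictions back into readable phone sequences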
        self.id_to_phone = {v: k for k, v in self.phone_to_id.items()}

    def string_to_tensor(self, text, view=False, device="cpu", handle_missing=True, input_phonemes=False):
"""
Fixes unicode errors, expands some abbreviations,
turns graphemes into phonemes and then vectorizes
the sequence as articulatory features
"""
if input_phonemes:
phones = text
else:
phones = self.get_phone_string(text=text, include_eos_symbol=True)
if view:
print("Phonemes: \n{}\n".format(phones))
phones_vector = list()
# turn into numeric vectors
for char in phones:
if handle_missing:
try:
phones_vector.append(self.phone_to_vector[char])
except KeyError:
print("unknown phoneme: {}".format(char))
else:
phones_vector.append(self.phone_to_vector[char]) # leave error handling to elsewhere
        # build a float tensor of shape (number of phones, feature dimensionality);
        # torch.tensor with an explicit float dtype is used instead of the legacy
        # torch.Tensor constructor, which only accepts cpu as device for list inputs
        return torch.tensor(phones_vector, device=device, dtype=torch.float32)

    def get_phone_string(self, text, include_eos_symbol=True):
# expand abbreviations
utt = self.expand_abbreviations(text)
# phonemize
phones = phonemizer.phonemize(utt,
language_switch='remove-flags',
backend="espeak",
language=self.g2p_lang,
preserve_punctuation=True,
strip=True,
punctuation_marks=';:,.!?¡¿—…"«»“”~/',
                                      with_stress=self.use_stress).replace(";", ",").replace("/", " ").replace("—", "") \
            .replace(":", ",").replace('"', ",").replace("-", ",").replace("...", ",").replace("\n", " ") \
            .replace("\t", " ").replace("¡", "").replace("¿", "").replace(",", "~").replace(" ̃", "").replace('̩', "").replace("̃", "").replace("̪", "")
        # careful: some of the replacements above target combining characters that are nearly invisible in most editors
phones = re.sub("~+", "~", phones)
if not self.use_prosody:
# retain ~ as heuristic pause marker, even though all other symbols are removed with this option.
# also retain . ? and ! since they can be indicators for the stop token
phones = phones.replace("ˌ", "").replace("ː", "").replace("ˑ", "") \
.replace("˘", "").replace("|", "").replace("‖", "")
if not self.use_word_boundaries:
phones = phones.replace(" ", "")
else:
phones = re.sub(r"\s+", " ", phones)
phones = re.sub(" ", "~", phones)
if self.strip_silence:
            phones = phones.strip("~")
if self.add_silence_to_end:
phones += "~" # adding a silence in the end during add_silence_to_end produces more natural sounding prosody
if include_eos_symbol:
phones += "#"
phones = "~" + phones
phones = re.sub("~+", "~", phones)
        return phones


def english_text_expansion(text):
"""
    Applies a small part of the tacotron-style text cleaning pipeline, suitable for e.g. LJSpeech.
    See https://github.com/keithito/tacotron/
    Careful: Only apply to English datasets. Different languages need different cleaners.
"""
    # note: the pattern appends the literal dot itself, so the abbreviation keys must not include one,
    # otherwise the compiled patterns would never match the actual abbreviations
    _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in
                      [('Mrs', 'misess'), ('Mr', 'mister'), ('Dr', 'doctor'), ('St', 'saint'), ('Co', 'company'), ('Jr', 'junior'), ('Maj', 'major'),
                       ('Gen', 'general'), ('Drs', 'doctors'), ('Rev', 'reverend'), ('Lt', 'lieutenant'), ('Hon', 'honorable'), ('Sgt', 'sergeant'),
                       ('Capt', 'captain'), ('Esq', 'esquire'), ('Ltd', 'limited'), ('Col', 'colonel'), ('Ft', 'fort')]]
for regex, replacement in _abbreviations:
text = re.sub(regex, replacement, text)
    return text


def get_language_id(language):
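    # maps a language code to the ID of its language embedding;
    # unknown languages fall through and implicitly return None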
if language == "en":
return torch.LongTensor([0])
elif language == "de":
return torch.LongTensor([1])
elif language == "el":
return torch.LongTensor([2])
elif language == "es":
return torch.LongTensor([3])
elif language == "fi":
return torch.LongTensor([4])
elif language == "ru":
return torch.LongTensor([5])
elif language == "hu":
return torch.LongTensor([6])
elif language == "nl":
return torch.LongTensor([7])
elif language == "fr":
return torch.LongTensor([8])
elif language == "pt":
return torch.LongTensor([9])
elif language == "pl":
return torch.LongTensor([10])
elif language == "it":
        return torch.LongTensor([11])


if __name__ == '__main__':
# test an English utterance
tfr_en = ArticulatoryCombinedTextFrontend(language="en")
print(tfr_en.string_to_tensor("This is a complex sentence, it even has a pause! But can it do this? Nice.", view=True))
    # test a German utterance
    tfr_de = ArticulatoryCombinedTextFrontend(language="de")
    print(tfr_de.string_to_tensor("Alles klar, jetzt testen wir einen deutschen Satz. Ich hoffe es gibt nicht mehr viele unspezifizierte Phoneme.", view=True))