Yehor's picture
Init
ea6a7ed
"""adapted from https://github.com/keithito/tacotron"""
import re
import numpy as np
from .cleaners import Cleaner
from .symbols import get_symbols
from .grapheme_dictionary import Grapheme2PhonemeDictionary
#########
# REGEX #
#########
# Regular expression matching text enclosed in curly braces for encoding
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
# Regular expression matching words and not words
_words_re = re.compile(
r"([a-zA-ZÀ-ž]+['][a-zA-ZÀ-ž]+|[a-zA-ZÀ-ž]+)|([{][^}]+[}]|[^a-zA-ZÀ-ž{}]+)"
)
def lines_to_list(filename):
with open(filename, encoding="utf-8") as f:
lines = f.readlines()
lines = [l.rstrip() for l in lines]
return lines
class TextProcessing(object):
def __init__(
self,
symbol_set,
cleaner_name,
heteronyms_path,
phoneme_dict_path,
p_phoneme,
handle_phoneme,
handle_phoneme_ambiguous,
prepend_space_to_text=False,
append_space_to_text=False,
add_bos_eos_to_text=False,
encoding="latin-1",
):
if heteronyms_path is not None and heteronyms_path != "":
self.heteronyms = set(lines_to_list(heteronyms_path))
else:
self.heteronyms = []
# phoneme dict
self.phonemedict = {}
self.p_phoneme = p_phoneme
self.handle_phoneme = handle_phoneme
self.handle_phoneme_ambiguous = handle_phoneme_ambiguous
self.symbols = get_symbols(symbol_set)
self.cleaner_names = cleaner_name
self.cleaner = Cleaner(cleaner_name, self.phonemedict)
self.prepend_space_to_text = prepend_space_to_text
self.append_space_to_text = append_space_to_text
self.add_bos_eos_to_text = add_bos_eos_to_text
if add_bos_eos_to_text:
self.symbols.append("<bos>")
self.symbols.append("<eos>")
# Mappings from symbol to numeric ID and vice versa:
self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
self.id_to_symbol = {i: s for i, s in enumerate(self.symbols)}
def text_to_sequence(self, text):
sequence = []
# Check for curly braces and treat their contents as phoneme:
while len(text):
m = _curly_re.match(text)
if not m:
sequence += self.symbols_to_sequence(text)
break
sequence += self.symbols_to_sequence(m.group(1))
sequence += self.phoneme_to_sequence(m.group(2))
text = m.group(3)
return sequence
def sequence_to_text(self, sequence):
result = ""
for symbol_id in sequence:
if symbol_id in self.id_to_symbol:
s = self.id_to_symbol[symbol_id]
# Enclose phoneme back in curly braces:
if len(s) > 1 and s[0] == "@":
s = "{%s}" % s[1:]
result += s
return result.replace("}{", " ")
def clean_text(self, text):
text = self.cleaner(text)
return text
def symbols_to_sequence(self, symbols):
return [self.symbol_to_id[s] for s in symbols if s in self.symbol_to_id]
def phoneme_to_sequence(self, text):
return self.symbols_to_sequence(["@" + s for s in text.split()])
def get_phoneme(self, word):
phoneme_suffix = ""
if word.lower() in self.heteronyms:
return word
if len(word) > 2 and word.endswith("'s"):
phoneme = self.phonemedict.lookup(word)
if phoneme is None:
phoneme = self.phonemedict.lookup(word[:-2])
phoneme_suffix = "" if phoneme is None else " Z"
elif len(word) > 1 and word.endswith("s"):
phoneme = self.phonemedict.lookup(word)
if phoneme is None:
phoneme = self.phonemedict.lookup(word[:-1])
phoneme_suffix = "" if phoneme is None else " Z"
else:
phoneme = self.phonemedict.lookup(word)
if phoneme is None:
return word
if len(phoneme) > 1:
if self.handle_phoneme_ambiguous == "first":
phoneme = phoneme[0]
elif self.handle_phoneme_ambiguous == "random":
phoneme = np.random.choice(phoneme)
elif self.handle_phoneme_ambiguous == "ignore":
return word
else:
phoneme = phoneme[0]
phoneme = "{" + phoneme + phoneme_suffix + "}"
return phoneme
def encode_text(self, text, return_all=False):
text_clean = self.clean_text(text)
text = text_clean
text_phoneme = ""
if self.p_phoneme > 0:
text_phoneme = self.convert_to_phoneme(text)
text = text_phoneme
text_encoded = self.text_to_sequence(text)
if self.prepend_space_to_text:
text_encoded.insert(0, self.symbol_to_id[" "])
if self.append_space_to_text:
text_encoded.append(self.symbol_to_id[" "])
if self.add_bos_eos_to_text:
text_encoded.insert(0, self.symbol_to_id["<bos>"])
text_encoded.append(self.symbol_to_id["<eos>"])
if return_all:
return text_encoded, text_clean, text_phoneme
return text_encoded
def convert_to_phoneme(self, text):
if self.handle_phoneme == "sentence":
if np.random.uniform() < self.p_phoneme:
words = _words_re.findall(text)
text_phoneme = [
self.get_phoneme(word[0])
if (word[0] != "")
else re.sub(r"\s(\d)", r"\1", word[1].upper())
for word in words
]
text_phoneme = "".join(text_phoneme)
text = text_phoneme
elif self.handle_phoneme == "word":
words = _words_re.findall(text)
text_phoneme = [
re.sub(r"\s(\d)", r"\1", word[1].upper())
if word[0] == ""
else (
self.get_phoneme(word[0])
if np.random.uniform() < self.p_phoneme
else word[0]
)
for word in words
]
text_phoneme = "".join(text_phoneme)
text = text_phoneme
elif self.handle_phoneme != "":
raise Exception(
"{} handle_phoneme is not supported".format(self.handle_phoneme)
)
return text