# tensorflow_tts/processor/ljspeechu.py
# -*- coding: utf-8 -*-
# Copyright 2020 TensorFlowTTS Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Perform preprocessing and raw feature extraction for LJSpeech Ultimate dataset."""

import os
import re
from dataclasses import dataclass

import numpy as np
import soundfile as sf
from g2p_en import G2p as grapheme_to_phn

from tensorflow_tts.processor import BaseProcessor
from tensorflow_tts.utils import cleaners
from tensorflow_tts.utils.utils import PROCESSOR_FILE_NAME
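
# The ARPAbet phoneme inventory (CMUdict-style); vowels additionally carry
# 0/1/2 variants marking no / primary / secondary lexical stress.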
valid_symbols = [
    "AA",
    "AA0",
    "AA1",
    "AA2",
    "AE",
    "AE0",
    "AE1",
    "AE2",
    "AH",
    "AH0",
    "AH1",
    "AH2",
    "AO",
    "AO0",
    "AO1",
    "AO2",
    "AW",
    "AW0",
    "AW1",
    "AW2",
    "AY",
    "AY0",
    "AY1",
    "AY2",
    "B",
    "CH",
    "D",
    "DH",
    "EH",
    "EH0",
    "EH1",
    "EH2",
    "ER",
    "ER0",
    "ER1",
    "ER2",
    "EY",
    "EY0",
    "EY1",
    "EY2",
    "F",
    "G",
    "HH",
    "IH",
    "IH0",
    "IH1",
    "IH2",
    "IY",
    "IY0",
    "IY1",
    "IY2",
    "JH",
    "K",
    "L",
    "M",
    "N",
    "NG",
    "OW",
    "OW0",
    "OW1",
    "OW2",
    "OY",
    "OY0",
    "OY1",
    "OY2",
    "P",
    "R",
    "S",
    "SH",
    "T",
    "TH",
    "UH",
    "UH0",
    "UH1",
    "UH2",
    "UW",
    "UW0",
    "UW1",
    "UW2",
    "V",
    "W",
    "Y",
    "Z",
    "ZH",
]
_pad = "pad"
_eos = "eos"
# Unlike LJSpeech, we do not use spaces since we are phoneme only and spaces lead
# to very bad attention performance with phonetic input.
_punctuation = "!'(),.:;?"
_special = "-"

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ["@" + s for s in valid_symbols]

# Export all symbols:
LJSPEECH_U_SYMBOLS = [_pad] + list(_special) + list(_punctuation) + _arpabet + [_eos]
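
# Resulting symbol table (illustrative): index 0 is "pad", then "-", the punctuation
# marks, the "@"-prefixed ARPAbet phonemes, and "eos" last, e.g.
# LJSPEECH_U_SYMBOLS[0] == "pad" and LJSPEECH_U_SYMBOLS[-1] == "eos".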

# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")

_arpa_exempt = _punctuation + _special

arpa_g2p = grapheme_to_phn()


@dataclass
class LJSpeechUltimateProcessor(BaseProcessor):
    """LJSpeech Ultimate processor."""

    cleaner_names: str = "english_cleaners"
    positions = {
        "wave_file": 0,
        "text_norm": 1,
    }
    train_f_name: str = "filelist.txt"
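
    # Expected filelist line format (illustrative example, one utterance per line):
    #     wavs/LJ001-0001.wav|Normalized transcript of the utterance.
    # i.e. a wav path relative to data_dir and the normalized text, separated by "|".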

    def create_items(self):
        if self.data_dir:
            with open(
                os.path.join(self.data_dir, self.train_f_name), encoding="utf-8"
            ) as f:
                self.items = [self.split_line(self.data_dir, line, "|") for line in f]

    def split_line(self, data_dir, line, split):
        parts = line.strip().split(split)
        wave_file = parts[self.positions["wave_file"]]
        text_norm = parts[self.positions["text_norm"]]
        wav_path = os.path.join(data_dir, wave_file)
        speaker_name = "ljspeech"
        return text_norm, wav_path, speaker_name

    def setup_eos_token(self):
        return _eos

    def save_pretrained(self, saved_path):
        os.makedirs(saved_path, exist_ok=True)
        self._save_mapper(os.path.join(saved_path, PROCESSOR_FILE_NAME), {})

    def to_arpa(self, in_str):
        phn_arr = arpa_g2p(in_str)
        phn_arr = [x for x in phn_arr if x != " "]
        arpa_str = "{"
        in_chain = True
        # Walk the phoneme array and build the ARPA string: phoneme runs are wrapped
        # in curly braces, while punctuation stays outside the braces.
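        # Example (illustrative): "Hello, world." would come back from g2p_en as
        # ["HH", "AH0", "L", "OW1", ",", "W", "ER1", "L", "D", "."] (spaces removed),
        # and the loop below renders it as "{ HH AH0 L OW1 } , { W ER1 L D } .".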
        for token in phn_arr:
            if token in _arpa_exempt and in_chain:
                arpa_str += " }"
                in_chain = False
            if token not in _arpa_exempt and not in_chain:
                arpa_str += " {"
                in_chain = True
            arpa_str += " " + token
        if in_chain:
            arpa_str += " }"
        return arpa_str

    def get_one_sample(self, item):
        text, wav_path, speaker_name = item

        # If the line is not already an ARPA string (marked by curly braces),
        # convert it to one first.
        if "{" not in text:
            text = self.to_arpa(text)

        # audio is already normalized to [-1, 1] by soundfile.
        audio, rate = sf.read(wav_path)
        audio = audio.astype(np.float32)

        # convert text to ids
        text_ids = np.asarray(self.text_to_sequence(text), np.int32)

        sample = {
            "raw_text": text,
            "text_ids": text_ids,
            "audio": audio,
            "utt_id": os.path.split(wav_path)[-1].split(".")[0],
            "speaker_name": speaker_name,
            "rate": rate,
        }

        return sample

    def text_to_sequence(self, text):
        sequence = []
        # Check for curly braces and treat their contents as ARPAbet:
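        # Example (illustrative): for "{ HH AH0 L OW1 } , { W ER1 L D } ." the regex
        # repeatedly splits off a (possibly empty) plain-text prefix, a braced ARPAbet
        # chunk, and the remainder; the chunk is mapped via "@"-prefixed symbols
        # ("@HH", "@AH0", ...), the prefix goes through the cleaners, and the eos id
        # is appended last.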
        while len(text):
            m = _curly_re.match(text)
            if not m:
                sequence += self._symbols_to_sequence(
                    self._clean_text(text, [self.cleaner_names])
                )
                break
            sequence += self._symbols_to_sequence(
                self._clean_text(m.group(1), [self.cleaner_names])
            )
            sequence += self._arpabet_to_sequence(m.group(2))
            text = m.group(3)

        # append the eos token
        sequence += [self.eos_id]
        return sequence

    def _clean_text(self, text, cleaner_names):
        for name in cleaner_names:
            # Use a None default so an unknown cleaner hits the explicit error below
            # instead of raising AttributeError inside getattr.
            cleaner = getattr(cleaners, name, None)
            if not cleaner:
                raise Exception("Unknown cleaner: %s" % name)
            text = cleaner(text)
        return text

    def _symbols_to_sequence(self, symbols):
        return [self.symbol_to_id[s] for s in symbols if self._should_keep_symbol(s)]

    def _arpabet_to_sequence(self, text):
        return self._symbols_to_sequence(["@" + s for s in text.split()])

    def _should_keep_symbol(self, s):
        return s in self.symbol_to_id and s != "_" and s != "~"
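

if __name__ == "__main__":
    # Minimal usage sketch (illustrative, not part of the library API): assumes a
    # local dataset directory containing "filelist.txt" and the referenced wav files,
    # and that the inherited BaseProcessor dataclass exposes `data_dir` and `symbols`
    # as constructor fields.
    processor = LJSpeechUltimateProcessor(
        data_dir="./LJSpeech-Ultimate/",  # hypothetical dataset path
        symbols=LJSPEECH_U_SYMBOLS,
    )
    # Grapheme-to-ARPA conversion on raw text.
    print(processor.to_arpa("Printing, in the only sense with which we are at present concerned."))
    # Feature extraction for the first item listed in filelist.txt.
    sample = processor.get_one_sample(processor.items[0])
    print(sample["utt_id"], sample["text_ids"].shape, sample["rate"])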