# -*- coding: utf-8 -*-
# Copyright 2020 TensorFlowTTS Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Perform preprocessing and raw feature extraction for the LJSpeech Ultimate dataset."""

import os
import re
from dataclasses import dataclass

import numpy as np
import soundfile as sf
from g2p_en import G2p as grapheme_to_phn

from tensorflow_tts.processor import BaseProcessor
from tensorflow_tts.utils import cleaners
from tensorflow_tts.utils.utils import PROCESSOR_FILE_NAME

valid_symbols = [
    "AA", "AA0", "AA1", "AA2", "AE", "AE0", "AE1", "AE2", "AH", "AH0", "AH1",
    "AH2", "AO", "AO0", "AO1", "AO2", "AW", "AW0", "AW1", "AW2", "AY", "AY0",
    "AY1", "AY2", "B", "CH", "D", "DH", "EH", "EH0", "EH1", "EH2", "ER", "ER0",
    "ER1", "ER2", "EY", "EY0", "EY1", "EY2", "F", "G", "HH", "IH", "IH0",
    "IH1", "IH2", "IY", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG",
    "OW", "OW0", "OW1", "OW2", "OY", "OY0", "OY1", "OY2", "P", "R", "S", "SH",
    "T", "TH", "UH", "UH0", "UH1", "UH2", "UW", "UW0", "UW1", "UW2", "V", "W",
    "Y", "Z", "ZH",
]

_pad = "pad"
_eos = "eos"
# Unlike the LJSpeech processor, the space character is omitted: the input is
# phoneme-only, and spaces severely degrade attention alignment on phonetic input.
_punctuation = "!'(),.:;?"
_special = "-"

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ["@" + s for s in valid_symbols]

# Export all symbols:
LJSPEECH_U_SYMBOLS = [_pad] + list(_special) + list(_punctuation) + _arpabet + [_eos]
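
# The resulting table has 96 entries: "pad" (id 0), "-", nine punctuation marks,
# the 84 "@"-prefixed ARPAbet symbols, and "eos" as the final id.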

# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
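# e.g. "Turn left on {HH AW1 S S T AH0 N} Street." splits into the groups
# ("Turn left on ", "HH AW1 S S T AH0 N", " Street.").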

_arpa_exempt = _punctuation + _special  # symbols kept outside curly-brace phoneme chains
arpa_g2p = grapheme_to_phn()  # shared g2p_en grapheme-to-phoneme converter instance


@dataclass
class LJSpeechUltimateProcessor(BaseProcessor):
    """LJSpeech Ultimate processor."""

    cleaner_names: str = "english_cleaners"
    positions = {
        "wave_file": 0,
        "text_norm": 1,
    }
    train_f_name: str = "filelist.txt"
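
    # Each line of train_f_name is expected to be "<wave_file>|<text_norm>",
    # with the wave path relative to data_dir; e.g. (hypothetical entry):
    #   wavs/LJ001-0001.wav|Printing, in the only sense with which we are at present concerned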

    def create_items(self):
        if self.data_dir:
            with open(
                os.path.join(self.data_dir, self.train_f_name), encoding="utf-8"
            ) as f:
                self.items = [self.split_line(self.data_dir, line, "|") for line in f]

    def split_line(self, data_dir, line, split):
        parts = line.strip().split(split)
        wave_file = parts[self.positions["wave_file"]]
        text_norm = parts[self.positions["text_norm"]]
        wav_path = os.path.join(data_dir, wave_file)
        speaker_name = "ljspeech"
        return text_norm, wav_path, speaker_name

    def setup_eos_token(self):
        return _eos

    def save_pretrained(self, saved_path):
        os.makedirs(saved_path, exist_ok=True)
        self._save_mapper(os.path.join(saved_path, PROCESSOR_FILE_NAME), {})

    def to_arpa(self, in_str):
        phn_arr = arpa_g2p(in_str)
        phn_arr = [x for x in phn_arr if x != " "]
        arpa_str = "{"
        in_chain = True
        # Walk the token list and build the ARPA string: phonemes are wrapped in
        # curly-brace chains, while punctuation and special symbols stay outside.
        for token in phn_arr:
            if token in _arpa_exempt and in_chain:
                arpa_str += " }"
                in_chain = False
            if token not in _arpa_exempt and not in_chain:
                arpa_str += " {"
                in_chain = True
            arpa_str += " " + token
        if in_chain:
            arpa_str += " }"
        return arpa_str
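
    # Worked example (assuming g2p_en's usual CMUdict-style output for this input):
    #   to_arpa("Hello, world.")
    #   g2p tokens (spaces removed): ['HH', 'AH0', 'L', 'OW1', ',', 'W', 'ER1', 'L', 'D', '.']
    #   result: "{ HH AH0 L OW1 } , { W ER1 L D } ."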

    def get_one_sample(self, item):
        text, wav_path, speaker_name = item

        # If the text is not already an ARPA string (signalled by a curly brace),
        # run it through the grapheme-to-phoneme converter first.
        if "{" not in text:
            text = self.to_arpa(text)

        # audio signal is normalized to [-1, 1]; soundfile already returns it that way.
        audio, rate = sf.read(wav_path)
        audio = audio.astype(np.float32)

        # convert text to ids
        text_ids = np.asarray(self.text_to_sequence(text), np.int32)

        sample = {
            "raw_text": text,
            "text_ids": text_ids,
            "audio": audio,
            "utt_id": os.path.split(wav_path)[-1].split(".")[0],
            "speaker_name": speaker_name,
            "rate": rate,
        }

        return sample

    def text_to_sequence(self, text):
        sequence = []
        # Check for curly braces and treat their contents as ARPAbet:
        while len(text):
            m = _curly_re.match(text)
            if not m:
                sequence += self._symbols_to_sequence(
                    self._clean_text(text, [self.cleaner_names])
                )
                break
            sequence += self._symbols_to_sequence(
                self._clean_text(m.group(1), [self.cleaner_names])
            )
            sequence += self._arpabet_to_sequence(m.group(2))
            text = m.group(3)

        # add eos token
        sequence += [self.eos_id]
        return sequence
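
    # e.g. for the ARPA string produced above, "{ HH AH0 L OW1 } , { W ER1 L D } ."
    # yields the ids of [@HH, @AH0, @L, @OW1, ",", @W, @ER1, @L, @D, "."] plus eos.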

    def _clean_text(self, text, cleaner_names):
        for name in cleaner_names:
            # Default to None so a missing cleaner raises our error rather than
            # an AttributeError inside getattr.
            cleaner = getattr(cleaners, name, None)
            if cleaner is None:
                raise ValueError("Unknown cleaner: %s" % name)
            text = cleaner(text)
        return text

    def _symbols_to_sequence(self, symbols):
        return [self.symbol_to_id[s] for s in symbols if self._should_keep_symbol(s)]

    def _arpabet_to_sequence(self, text):
        return self._symbols_to_sequence(["@" + s for s in text.split()])

    def _should_keep_symbol(self, s):
        # Drop anything outside the symbol table (e.g. spaces), plus the legacy
        # Tacotron padding/eos characters "_" and "~".
        return s in self.symbol_to_id and s != "_" and s != "~"
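

# Hypothetical usage sketch: constructor arguments mirror BaseProcessor's dataclass
# fields and may differ in your checkout; adjust data_dir to your dataset root.
#
#   processor = LJSpeechUltimateProcessor(
#       data_dir="./LJSpeech-Ultimate/",  # assumed root containing filelist.txt and wavs
#       symbols=LJSPEECH_U_SYMBOLS,
#   )
#   sample = processor.get_one_sample(processor.items[0])
#   print(sample["text_ids"], sample["rate"])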