| import os | |
| import re | |
| import sys | |
| import json | |
| import codecs | |
| import glob | |
| from unidecode import unidecode | |
| # from g2pc import G2pC | |
| from h2p_parser.h2p import H2p | |
| from num2words import num2words | |
| import pykakasi | |
| import epitran | |
| # https://www.lexilogos.com/keyboard/pinyin_conversion.htm | |
| import nltk | |
| nltk.download('punkt', quiet=True) | |
| from nltk.tokenize import word_tokenize | |
| # I really need to find a better way to do this (handling many different possible entry points) | |
| try: | |
| sys.path.append(".") | |
| from resources.app.python.xvapitch.text.ipa_to_xvaarpabet import ESpeak, ipa2xvaarpabet, PUNCTUATION, ALL_SYMBOLS, PIN_YIN_ENDS, pinyin_to_arpabet_mappings, text_pinyin_to_pinyin_symbs, manual_phone_replacements | |
| from resources.app.python.xvapitch.text.en_numbers import normalize_numbers as en_normalize_numbers | |
| from resources.app.python.xvapitch.text.ro_numbers import generateWords as ro_generateWords | |
| except ModuleNotFoundError: | |
| try: | |
| from python.xvapitch.text.ipa_to_xvaarpabet import ESpeak, ipa2xvaarpabet, PUNCTUATION, ALL_SYMBOLS, PIN_YIN_ENDS, pinyin_to_arpabet_mappings, text_pinyin_to_pinyin_symbs, manual_phone_replacements | |
| from python.xvapitch.text.en_numbers import normalize_numbers as en_normalize_numbers | |
| from python.xvapitch.text.ro_numbers import generateWords as ro_generateWords | |
| except ModuleNotFoundError: | |
| try: | |
| from text.ipa_to_xvaarpabet import ESpeak, ipa2xvaarpabet, PUNCTUATION, ALL_SYMBOLS, PIN_YIN_ENDS, pinyin_to_arpabet_mappings, text_pinyin_to_pinyin_symbs, manual_phone_replacements | |
| from text.en_numbers import normalize_numbers as en_normalize_numbers | |
| from text.ro_numbers import generateWords as ro_generateWords | |
| except ModuleNotFoundError: | |
| from ipa_to_xvaarpabet import ESpeak, ipa2xvaarpabet, PUNCTUATION, ALL_SYMBOLS, PIN_YIN_ENDS, pinyin_to_arpabet_mappings, text_pinyin_to_pinyin_symbs, manual_phone_replacements | |
| from en_numbers import normalize_numbers as en_normalize_numbers | |
| from ro_numbers import generateWords as ro_generateWords | |
| # Processing order: | |
| # - text-to-text, clean up numbers | |
| # - text-to-text, clean up abbreviations | |
| # - text->phone, Custom dict replacements | |
| # - text->phone, Heteronyms detection and replacement | |
| # - text->phone, built-in dicts replacements (eg CMUdict) | |
| # - text->text/phone, missed words ngram/POS splitting, and re-trying built-in dicts (eg CMUdict) | |
| # - text->phone, g2p (eg espeak) | |
| # - phone->[integer], convert phonemes to their index numbers, for use by the models | |
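# Illustrative usage sketch (not part of the module): roughly how the steps above are driven
# end-to-end through TextPreprocessor.text_to_sequence(). The paths below are hypothetical.
#
#   tp = EnglishTextPreprocessor(base_dir="./text")
#   tp.load_dict("./text/dicts/cmudict.txt")                   # built-in dict (eg CMUdict)
#   tp.load_dict("./my_custom_words.json", isCustom=True)      # custom dict, takes priority
#   tp.load_g2p_cache("./text/g2p_cache/espeak/espeak_cache_en.txt")
#   sequence, cleaned_text = tp.text_to_sequence("Dr. Smith lives at 10 Baker Street.")
#   # sequence     -> list of ALL_SYMBOLS indices, ready for the models
#   # cleaned_text -> the same symbols joined with "|", useful for debugging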
| # class EspeakWrapper(object): | |
| # def __init__(self, base_dir, lang): | |
| # super(EspeakWrapper, self).__init__() | |
| # from phonemizer.backend import EspeakBackend | |
| # from phonemizer.backend.espeak.base import BaseEspeakBackend | |
| # # from phonemizer.backend.espeak import EspeakBackend | |
| # from phonemizer.separator import Separator | |
| # base_dir = f'C:/Program Files/' | |
| # espeak_dll_path = f'{base_dir}/eSpeak_NG/libespeak-ng.dll' | |
| # # espeak_dll_path = f'{base_dir}/libespeak-ng.dll' | |
| # # espeak_dll_path = f'{base_dir}/' | |
| # print(f'espeak_dll_path, {espeak_dll_path}') | |
| # BaseEspeakBackend.set_library(espeak_dll_path) | |
| # # EspeakBackend.set_library(espeak_dll_path) | |
| # self.backend = EspeakBackend(lang) | |
| # print(f'self.backend, {self.backend}') | |
| # self.separator = Separator(phone="|", syllable="", word="") | |
| # print(f'self.separator, {self.separator}') | |
| # def phonemize (self, word): | |
| # return self.backend.phonemize(word, self.separator) | |
| class TextPreprocessor(): | |
| def __init__(self, lang_code, lang_code2, base_dir, add_blank=True, logger=None, use_g2p=True, use_epitran=False): | |
| super(TextPreprocessor, self).__init__() | |
| self.use_g2p = use_g2p | |
| self.use_epitran = use_epitran | |
| self.logger = logger | |
| self.ALL_SYMBOLS = ALL_SYMBOLS | |
| self.lang_code = lang_code | |
| self.lang_code2 = lang_code2 | |
| self.g2p_cache = {} | |
| self.g2p_cache_path = None | |
| self.add_blank = add_blank | |
| self.dicts = [] | |
| self.dict_words = [] # Cache | |
| self.dict_is_custom = [] # Built-in, or custom; Give custom dict entries priority over other pre-processing steps | |
| self._punctuation = '!\'(),.:;? ' # Standard English punctuation symbols | |
| self.punct_to_whitespace_reg = re.compile(f'[\.,!?]*') | |
| self.espeak = None | |
| self.epitran = None | |
| # self.custom_g2p_fn = None | |
| if lang_code2: | |
| # if self.use_epitran and self.use_g2p: | |
| if self.use_epitran: | |
| self.epitran = epitran.Epitran(self.lang_code2) | |
| elif self.use_g2p: | |
| base_dir = os.path.dirname(os.path.realpath(__file__)) | |
| self.espeak = ESpeak(base_dir, language=self.lang_code2, keep_puncs=True) | |
| self.h2p = None | |
| if lang_code=="en": | |
| self.h2p = H2p(preload=True) | |
| # Regular expression matching text enclosed in curly braces: | |
| self._curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') | |
| self.num2words_fn = None | |
| num2words_supported_langs = ["en","ar","cz","de","dk","en_GB","en_IN","es","es_CO","es_VE","eu","fi","fr","fr_CH","fr_BE","fr_DZ","he","id","it","ja","kn","ko","lt","lv","no","pl","pt","pt_BR","sl","sr","ro","ru","sl","tr","th","vi","nl","uk"] | |
| if lang_code in num2words_supported_langs: | |
| self.num2words_fn = num2words | |
| def init_post(self): | |
| self.re_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in self.abbreviations] | |
| # Override - language specific | |
| def clean_numbers(self, text): | |
| return text | |
| # Override - language specific | |
| def clean_am_pm(self, text): | |
| return text | |
| def clean_abbreviations(self, text): | |
| for regex, replacement in self.re_abbreviations: | |
| text = re.sub(regex, replacement, text) | |
| return text | |
| def collapse_whitespace(self, text): | |
| _whitespace_re = re.compile(r'\s+') | |
| return re.sub(_whitespace_re, ' ', text) | |
| def load_dict (self, dict_path, isCustom=False): | |
| pron_dict = {} | |
| if dict_path.endswith(".txt"): | |
| pron_dict = self.read_txt_dict(dict_path, pron_dict) | |
| elif dict_path.endswith(".json"): | |
| pron_dict = self.read_json_dict(dict_path, pron_dict) | |
| pron_dict = self.post_process_dict(pron_dict) | |
| self.dict_is_custom.append(isCustom) | |
| self.dicts.append(pron_dict) | |
| self.dict_words.append(list(pron_dict.keys())) | |
| # Override | |
| def post_process_dict(self, pron_dict): | |
| return pron_dict | |
| def read_txt_dict (self, dict_path, pron_dict): | |
| with codecs.open(dict_path, encoding="utf-8") as f: | |
| lines = f.read().split("\n") | |
| for line in lines: | |
| if len(line.strip()): | |
| # if len(line.strip()) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): | |
| word = line.split(" ")[0].lower() | |
| pron = " ".join(line.split(" ")[1:]).strip().upper() | |
| # TODO? Check if the phonemes are valid? | |
| # TODO? Handle variants(1) | |
| pron_dict[word] = pron | |
| return pron_dict | |
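# A .txt pronunciation dictionary is expected to hold one "WORD PHONEMES" pair per line,
# e.g. (illustrative CMUdict-style entries):
#   HELLO HH AH0 L OW1
#   WORLD W ER1 L D
# The word is lower-cased and the phonemes upper-cased on load, so the first entry above
# ends up as pron_dict["hello"] == "HH AH0 L OW1".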
| def read_json_dict (self, dict_path, pron_dict): | |
| with codecs.open(dict_path, encoding="utf-8") as f: | |
| json_data = json.load(f) | |
| for word in list(json_data["data"].keys()): | |
| if json_data["data"][word]["enabled"]==True: | |
| # TODO? Check if the phonemes are valid? | |
| # TODO? Handle variants(1) | |
| pron_dict[word.lower()] = json_data["data"][word]["arpabet"].upper() | |
| return pron_dict | |
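# A .json dictionary is assumed to follow the layout read above (illustrative; only the
# fields actually used are shown, and the word/phonemes are made up):
#   {"data": {"Daedra": {"enabled": true, "arpabet": "D EY1 D R AH0"}}}
# Disabled entries are skipped; enabled ones are stored as pron_dict["daedra"] = "D EY1 D R AH0".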
| def dict_replace (self, text, customDicts): | |
| for di, pron_dict in enumerate(self.dicts): | |
| if (customDicts and self.dict_is_custom[di]) or (not customDicts and not self.dict_is_custom[di]): | |
| dict_words = self.dict_words[di] | |
| text_graphites = re.sub("{([^}]*)}", "", text, flags=re.IGNORECASE) | |
| # Don't run the ARPAbet replacement for every single dictionary word, as it would be too slow. Instead, do it only for dictionary words that are actually present in the prompt | |
| words_in_prompt = (text_graphites+" ").replace("}","").replace("{","").replace(",","").replace("?","").replace("!","").replace(";","").replace(":","").replace("...",".").replace(". "," ").lower().split(" ") | |
| words_in_prompt = [word.strip() for word in words_in_prompt if len(word.strip()) and word.lower() in dict_words] | |
| if len(words_in_prompt): | |
| # Pad out punctuation, to make sure they don't get used in the word look-ups | |
| text = " "+text.replace(",", " ,").replace(".", " .").replace("!", " !").replace("?", " ?")+" " | |
| for di, dict_word in enumerate(words_in_prompt): | |
| dict_word_with_spaces = "{"+pron_dict[dict_word]+"}" | |
| dict_word_replace = dict_word.strip().replace(".", "\.").replace("(", "\(").replace(")", "\)") | |
| # Do it twice, because re will not re-use spaces, so if you have two neighbouring words to be replaced, | |
| # and they share a space character, one of them won't get changed | |
| for _ in range(2): | |
| text = re.sub(r'(?<!\{)\b'+dict_word_replace+r'\b(?![\w\s\(\)]*[\}])', dict_word_with_spaces, text, flags=re.IGNORECASE) | |
| # Undo the punctuation padding, to retain the original sentence structure | |
| text = text.replace(" ,", ",").replace(" .", ".").replace(" !", "!").replace(" ?", "?") | |
| text = re.sub("^\s+", " ", text) if text.startswith(" ") else re.sub("^\s*", "", text) | |
| text = re.sub("\s+$", " ", text) if text.endswith(" ") else re.sub("\s*$", "", text) | |
| return text | |
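# Illustrative example of dict_replace: with a loaded entry pron_dict["hello"] = "HH AH0 L OW1",
# every stand-alone occurrence of the word in the prompt (case-insensitive) is swapped for its
# phonemes wrapped in curly braces ("Hello" -> "{HH AH0 L OW1}"), while text that is already
# inside {...} is left untouched and the surrounding punctuation is preserved.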
| def detect_and_fill_heteronyms (self, text): | |
| if self.h2p is not None: | |
| text = self.h2p.replace_het(text) | |
| return text | |
| def clean_POS_and_subword_misses (self, text): | |
| # Eg plurals, possessives, contractions, hyphenated, compounds, stem, etc | |
| # TODO | |
| return text | |
| def load_g2p_cache (self, cache_path): | |
| # print(f'[DEBUG] Loading cache: {cache_path}') | |
| self.g2p_cache_path = cache_path | |
| if os.path.exists(cache_path): | |
| with open(cache_path, encoding="utf8") as f: | |
| lines = f.read().split("\n") | |
| for line in lines: | |
| if "|" in line: | |
| word = line.split("|")[0] | |
| phones = "|".join(line.split("|")[1:]) | |
| self.g2p_cache[word.lower().strip()] = phones.strip() | |
| else: | |
| print(f'g2p cache file not found at: {cache_path}') | |
| def save_g2p_cache (self): | |
| if self.g2p_cache_path: | |
| cache_out = [] | |
| cache_keys = sorted(list(self.g2p_cache.keys())) | |
| for key in cache_keys: | |
| cache_out.append(f'{key}|{self.g2p_cache[key]}') | |
| with open(self.g2p_cache_path, "w+", encoding="utf8") as f: | |
| f.write("\n".join(cache_out)) | |
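# The g2p cache is assumed to be a plain text file with one "word|phones" pair per line
# (IPA for the espeak/epitran backends, pinyin for the Chinese g2p), kept sorted by word,
# e.g. (illustrative):
#   hello|həlˈoʊ
#   world|wˈɜːld
# load_g2p_cache() lower-cases the word and keeps everything after the first "|" as the phones.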
| # Override | |
| def fill_missing_via_g2p (self, text): | |
| # TODO, switch to from nltk.tokenize import word_tokenize | |
| orig_text = text | |
| # print(f'[g2p] orig_text, |{orig_text}|') | |
| text_parts = text.split("{") | |
| text_parts2 = [(part.split("}")[1] if "}" in part else part) for part in text_parts] | |
| # print(f'[g2p] text_parts, {text_parts}') | |
| # print(f'[g2p] text_parts2, {text_parts2}') | |
| phonemised = [] | |
| for part in text_parts2: | |
| words = part.split(" ") | |
| part_phonemes = [] | |
| for word in words: | |
| word = word.strip() | |
| if len(word): | |
| # print(f'\n[g2p] word, {word}') | |
| sub_parts = [] | |
| sub_part_phonemes = [] | |
| # ====== punctuation stuff start ======== | |
| # Get which punctuation symbols are contained in the text fragment | |
| puncs_contained = [] | |
| for punc in PUNCTUATION: | |
| if punc in word: | |
| puncs_contained.append(punc) | |
| # Split away the punctuation from text | |
| sub_parts = [word] | |
| # print(f'puncs_contained, {puncs_contained}') | |
| if len(puncs_contained): | |
| for punc in puncs_contained: | |
| # init a new sub part list (list 2) | |
| sub_parts2 = [] | |
| # for each sub part... | |
| for sp in sub_parts: | |
| sp = sp.strip() | |
| # ...if it not already a punctuation symbol, try splitting it by the current punctuation symbol | |
| if sp not in PUNCTUATION: | |
| sp_split = sp.split(punc) | |
| # if the split list length is 1, add to list 2 | |
| if len(sp_split)==1: | |
| sub_parts2.append(sp_split[0]) | |
| else: | |
| # if it's more than 1 | |
| # print(f'sp_split, {sp_split}') | |
| for spspi, sps_part in enumerate(sp_split): | |
| # iterate through each item, and add to list, but also add the punct, apart from the last item | |
| sub_parts2.append(sps_part) | |
| if spspi<(len(sp_split)-1): | |
| sub_parts2.append(punc) | |
| else: | |
| # otherwise add the punct to list 2 | |
| sub_parts2.append(sp) | |
| # set the sub parts list to list 2, for the next loop, or ready | |
| sub_parts = sub_parts2 | |
| else: | |
| sub_parts = [word] | |
| # ====== punctuation stuff end ======== | |
| # print(f'sub_parts, {sub_parts}') | |
| for sp in sub_parts: | |
| if sp in PUNCTUATION: | |
| sub_part_phonemes.append(sp) | |
| else: | |
| sp = sp.replace("\"", "").replace(")", "").replace("(", "").replace("]", "").replace("[", "").strip() | |
| if len(sp): | |
| # print(f'sp, {sp}') | |
| if sp.lower() in self.g2p_cache.keys() and len(self.g2p_cache[sp.lower()].strip()): | |
| # print("in cache") | |
| g2p_out = ipa2xvaarpabet(self.g2p_cache[sp.lower()]) | |
| # print(f'g2p_out, {g2p_out}') | |
| sub_part_phonemes.append(g2p_out) | |
| else: | |
| if self.use_g2p or "custom_g2p_fn" in dir(self) or self.use_epitran: | |
| # print(f'self.custom_g2p_fn, {self.custom_g2p_fn}') | |
| if "custom_g2p_fn" in dir(self): | |
| g2p_out = self.custom_g2p_fn(sp) | |
| elif self.use_epitran: | |
| g2p_out = self.epitran.transliterate(sp) | |
| else: | |
| g2p_out = self.espeak.phonemize(sp).replace("|", " ") | |
| # print(f'g2p_out, {g2p_out}') | |
| self.g2p_cache[sp.lower()] = g2p_out | |
| self.save_g2p_cache() | |
| g2p_out = ipa2xvaarpabet(g2p_out) | |
| # print(f'g2p_out, {g2p_out}') | |
| sub_part_phonemes.append(g2p_out) | |
| # print(f'sp, {sp} ({len(self.g2p_cache.keys())}) {g2p_out}') | |
| part_phonemes.append(" ".join(sub_part_phonemes)) | |
| phonemised.append(" _ ".join(part_phonemes)) | |
| # print("--") | |
| # print(f'text_parts ({len(text_parts)}), {text_parts}') | |
| # print(f'[g2p] phonemised ({len(phonemised)}), {phonemised}') | |
| text = [] | |
| for ppi, phon_part in enumerate(phonemised): | |
| # print(f'phon_part, {phon_part}') | |
| prefix = "" | |
| if "}" in text_parts[ppi]: | |
| if ppi<len(phonemised)-1 and text_parts[ppi].split("}")[1].startswith(" "): | |
| prefix = text_parts[ppi].split("}")[0]+" _ " | |
| else: | |
| prefix = text_parts[ppi].split("}")[0]+" " | |
| text.append(f'{prefix} {phon_part}') | |
| # print(f'[g2p] text ({len(text)}), {text}') | |
| text_final = [] | |
| for tpi, text_part in enumerate(text): | |
| if tpi!=0 or text_part.strip()!="" or not orig_text.startswith("{"): | |
| # print(not orig_text.startswith("{"), tpi, f'|{text_part.strip()}|') | |
| text_final.append(text_part) | |
| if (tpi or orig_text.startswith(" ")) and ((tpi<len(text_parts2)-1 and text_parts2[tpi+1].startswith(" ")) or text_parts2[tpi].endswith(" ")): | |
| # print("adding _") | |
| text_final.append("_") | |
| text = " ".join(text_final).replace(" ", " ").replace(" ", " ").replace(" _ _ ", " _ ").replace(" _ _ ", " _ ") | |
| return text | |
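# Rough sketch of the behaviour above (illustrative): given "{HH AH0 L OW1} how are you?", the
# {...} fragment is passed through as-is, "how", "are" and "you" are looked up in the g2p cache
# (falling back to espeak / epitran / custom_g2p_fn on a miss, then ipa2xvaarpabet), the "?" is
# kept as its own symbol, and "_" word-boundary symbols are inserted between the phonemised words.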
| # Convert IPA fragments not already replaced by dicts/rules via espeak and post-processing | |
| def ipa_to_xVAARPAbet (self, ipa_text): | |
| xVAARPAbet = ipa2xvaarpabet(ipa_text) | |
| return xVAARPAbet | |
| def clean_special_chars(self, text): | |
| # quotemarks " can create problems for dict_replace | |
| # remove the ones at start and end of text | |
| if text.strip()[0]=="\"": | |
| text = text.replace("\"", "", 1) | |
| if text.strip()[-1]=="\"": | |
| text = text[::-1].replace("\"", "", 1)[::-1] | |
| # replace all others with a colon | |
| text = text.replace("\"", ":") | |
| # remove asterisks | |
| text = text.replace("*","") | |
| return text | |
| def text_to_phonemes (self, text): | |
| text = self.clean_special_chars(text) | |
| text = self.collapse_whitespace(text).replace(" }", "}").replace("{ ", "{") | |
| text = self.clean_am_pm(text) | |
| text = self.clean_numbers(text) | |
| # print(f'clean_numbers: |{text}|') | |
| text = self.clean_abbreviations(text) | |
| # print(f'clean_abbreviations: |{text}|') | |
| text = self.dict_replace(text, customDicts=True) | |
| # print(f'dict_replace(custom): |{text}|') | |
| text = self.detect_and_fill_heteronyms(text) | |
| # print(f'detect_and_fill_heteronyms: |{text}|') | |
| text = self.dict_replace(text, customDicts=False) | |
| # print(f'dict_replace(built-in):, |{text}|') | |
| text = self.clean_POS_and_subword_misses(text) | |
| # print(f'clean_POS_and_subword_misses: |{text}|') | |
| text = self.fill_missing_via_g2p(text) | |
| # print(f'fill_missing_via_g2p: |{text}|') | |
| return text | |
| # Main entry-point for pre-processing text completely into phonemes | |
| # This converts not the phonemes, but to the index numbers for the phonemes list, as required by the models | |
| def text_to_sequence (self, text): | |
| orig_text = text | |
| text = self.text_to_phonemes(text) # Get 100% phonemes from the text | |
| text = self.collapse_whitespace(text).strip() # Get rid of duplicate/padding spaces | |
| phonemes = text.split(" ") | |
| phonemes_final = [] | |
| for pi,phone in enumerate(phonemes): | |
| if phone in manual_phone_replacements.keys(): | |
| phonemes_final.append(manual_phone_replacements[phone]) | |
| else: | |
| phonemes_final.append(phone) | |
| # print(f'phonemes, {phonemes}') | |
| # with open(f'F:/Speech/xva-trainer/python/xvapitch/text_prep/debug.txt', "w+") as f: | |
| # f.write(" ".join(phonemes)) | |
| # sequence = [ALL_SYMBOLS.index(phone) for phone in phonemes] | |
| # blacklist = ["#"] | |
| try: | |
| sequence = [] | |
| for phone in phonemes_final: | |
| if phone=="#": # The g2p something returns things like "# foreign french". Cut away the commented out stuff, when this happens | |
| break | |
| if len(phone.strip()): | |
| sequence.append(ALL_SYMBOLS.index(phone)) | |
| # sequence = [ALL_SYMBOLS.index(phone) for phone in phonemes_final if len(phone) and phone.strip() not in blacklist] | |
| except: | |
| print(orig_text, phonemes_final) | |
| raise | |
| # Intersperse blank symbol if required | |
| if self.add_blank: | |
| sequence_ = [] | |
| for si,symb in enumerate(sequence): | |
| sequence_.append(symb) | |
| if si<len(sequence)-1: | |
| # sequence_.append(len(ALL_SYMBOLS)-1) | |
| sequence_.append(len(ALL_SYMBOLS)-2) | |
| sequence = sequence_ | |
| cleaned_text = "|".join([ALL_SYMBOLS[index] for index in sequence]) | |
| return sequence, cleaned_text | |
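# Minimal sketch of the final phoneme -> index step (illustrative; real values depend on ALL_SYMBOLS):
#   phonemes_final = ["HH", "AH0", "L", "OW1"]
#   sequence = [ALL_SYMBOLS.index(p) for p in phonemes_final]
#   # With add_blank=True the blank symbol index (len(ALL_SYMBOLS)-2 here) is interleaved
#   # between entries, but not appended after the last one.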
| def cleaned_text_to_sequence (self, text): | |
| text = self.collapse_whitespace(text).strip() # Get rid of duplicate/padding spaces | |
| phonemes = text.split(" ") | |
| sequence = [ALL_SYMBOLS.index(phone) for phone in phonemes] | |
| return sequence | |
| def sequence_to_text (self, sequence): # Used in debugging | |
| text = [] | |
| for ind in sequence[0]: | |
| text.append(ALL_SYMBOLS[ind]) | |
| return text | |
| class EnglishTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(EnglishTextPreprocessor, self).__init__("en", "en-us", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "English" | |
| self.abbreviations = [ | |
| ('mrs', 'misess'), | |
| ('mr', 'mister'), | |
| ('dr', 'doctor'), | |
| ('st', 'saint'), | |
| ('jr', 'junior'), | |
| ('maj', 'major'), | |
| ('drs', 'doctors'), | |
| ('rev', 'reverend'), | |
| ('lt', 'lieutenant'), | |
| ('sgt', 'sergeant'), | |
| ('capt', 'captain'), | |
| ('esq', 'esquire'), | |
| ('ltd', 'limited'), | |
| ('col', 'colonel'), | |
| ('ft', 'fort'), | |
| ] | |
| self.init_post() | |
| # from en_numbers import normalize_numbers | |
| self.normalize_numbers = en_normalize_numbers | |
| def post_process_dict (self, pron_dict): | |
| # CMUdict doesn't contain the symbols on the left, so these must be mapped to symbols that the models have actually | |
| # been trained with. This only applies to CMUdict, i.e. to English-trained models | |
| ARPAbet_replacements_dict = { | |
| "YO": "IY0 UW0", | |
| "UH": "UH0", | |
| "AR": "R", | |
| "EY": "EY0", | |
| "A": "AA0", | |
| "AW": "AW0", | |
| "X": "K S", | |
| "CX": "K HH", | |
| "AO": "AO0", | |
| "PF": "P F", | |
| "AY": "AY0", | |
| "OE": "OW0 IY0", | |
| "IY": "IY0", | |
| "EH": "EH0", | |
| "OY": "OY0", | |
| "IH": "IH0", | |
| "H": "HH" | |
| } | |
| for word in pron_dict.keys(): | |
| phonemes = pron_dict[word] | |
| for key in ARPAbet_replacements_dict.keys(): | |
| phonemes = phonemes.replace(f' {key} ', f' {ARPAbet_replacements_dict[key]} ') | |
| # Do it twice, because re will not re-use spaces, so if you have two neighbouring phonemes to be replaced, | |
| # and they share a space character, one of them won't get changed | |
| phonemes = phonemes.replace(f' {key} ', f' {ARPAbet_replacements_dict[key]} ') | |
| pron_dict[word] = phonemes | |
| return pron_dict | |
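# Illustrative example of the mapping above: a dictionary entry using the extended symbols is
# rewritten into CMUdict-style ARPAbet, e.g.
#   "R OY B AR D"  ->  "R OY0 B R D"    (OY -> OY0, AR -> R)
# The replace() only matches a symbol with a space on each side, which is why it runs twice:
# two neighbouring replaceable symbols share a space, and the first pass consumes it.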
| def clean_am_pm (self, text): | |
| words_out = [] | |
| numerals = ["0","1","2","3","4","5","6","7","8","9"] | |
| spelled_out = ["teen","one", "two", "three", "four", "five", "six", "seven", "eight", "nine","ten","twenty","thirty","forty","fifty","o'clock"] | |
| words = text.split(" ") | |
| for word in words: | |
| if word[:2].lower().strip()=="am": | |
| finishes_with_spelled_out_numeral = False | |
| for spelled_out_n in spelled_out: | |
| if len(words_out) and words_out[-1].endswith(spelled_out_n): | |
| finishes_with_spelled_out_numeral = True | |
| break | |
| if len(words_out) and words_out[-1] != '' and words_out[-1][-1] in numerals or finishes_with_spelled_out_numeral: | |
| word = "{EY0 IH0} {EH0 M}"+word[2:] | |
| words_out.append(word) | |
| return " ".join(words_out) | |
| def clean_numbers (self, text): | |
| # This (inflect code) also does things like currency, magnitudes, etc | |
| final_parts = [] | |
| # print(f'text, {text}') | |
| parts = re.split("({([^}]*)})", text) | |
| skip_next = False | |
| for part in parts: | |
| if "{" in part: | |
| final_parts.append(part) | |
| skip_next = True | |
| # print(f'[clean_numbers] keeping: {part}') | |
| else: | |
| if skip_next: | |
| skip_next = False | |
| else: | |
| # print(f'[clean_numbers] doing: {part}') | |
| final_parts.append(self.normalize_numbers(part)) | |
| text = "".join(final_parts) | |
| # print(f'[clean_numbers] parts, {parts}') | |
| return text | |
| # return self.normalize_numbers(text) | |
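# Illustrative behaviour: number normalisation (en_normalize_numbers, inflect-based) is applied
# only to the parts of the prompt that are NOT already inside {...} phoneme blocks, e.g.
#   "I paid $5 for {HH AH0 L OW1}"  ->  "I paid five dollars for {HH AH0 L OW1}"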
| def text_to_sequence(self, text): | |
| text = unidecode(text) # transliterate non-english letters to English, if they can be ascii | |
| return super(EnglishTextPreprocessor, self).text_to_sequence(text) | |
| class FrenchTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(FrenchTextPreprocessor, self).__init__("fr", "fr-fr", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "French" | |
| self.abbreviations = [ | |
| ("M", "monsieur"), | |
| ("Mlle", "mademoiselle"), | |
| ("Mlles", "mesdemoiselles"), | |
| ("Mme", "Madame"), | |
| ("Mmes", "Mesdames"), | |
| ("N.B", "nota bene"), | |
| ("M", "monsieur"), | |
| ("p.c.q", "parce que"), | |
| ("Pr", "professeur"), | |
| ("qqch", "quelque chose"), | |
| ("rdv", "rendez-vous"), | |
| ("no", "numéro"), | |
| ("adr", "adresse"), | |
| ("dr", "docteur"), | |
| ("st", "saint"), | |
| ("jr", "junior"), | |
| ("sgt", "sergent"), | |
| ("capt", "capitain"), | |
| ("col", "colonel"), | |
| ("av", "avenue"), | |
| ("av. J.-C", "avant Jésus-Christ"), | |
| ("apr. J.-C", "après Jésus-Christ"), | |
| ("boul", "boulevard"), | |
| ("c.-à-d", "c’est-à-dire"), | |
| ("etc", "et cetera"), | |
| ("ex", "exemple"), | |
| ("excl", "exclusivement"), | |
| ("boul", "boulevard"), | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| # https://github.com/virgil-av/numbers-to-words-romanian/blob/master/src/index.ts | |
| class RomanianTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(RomanianTextPreprocessor, self).__init__("ro", "ro", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Romanian" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = ro_generateWords | |
| self.init_post() | |
| class ItalianTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(ItalianTextPreprocessor, self).__init__("it", "it", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Italian" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class DanishTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(DanishTextPreprocessor, self).__init__("da", "da", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Danish" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class GermanTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(GermanTextPreprocessor, self).__init__("de", "de", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "German" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class AmharicTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(AmharicTextPreprocessor, self).__init__("am", "amh-Ethi", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Amharic" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class ArabicTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(ArabicTextPreprocessor, self).__init__("ar", "ar", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Arabic" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class MongolianTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(MongolianTextPreprocessor, self).__init__("mn", "mon-Cyrl", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Mongolian" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class DutchTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(DutchTextPreprocessor, self).__init__("nl", "nl", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Dutch" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class FinnishTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(FinnishTextPreprocessor, self).__init__("fi", "fi", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Finnish" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class GreekTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(GreekTextPreprocessor, self).__init__("el", "el", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Greek" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class HausaTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(HausaTextPreprocessor, self).__init__("ha", "hau-Latn", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Hausa" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class HindiTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(HindiTextPreprocessor, self).__init__("hi", "hi", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Hindi" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class HungarianTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(HungarianTextPreprocessor, self).__init__("hu", "hu", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Hungarian" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class JapaneseTextPreprocessor(TextPreprocessor): | |
| # Japanese: https://github.com/coqui-ai/TTS/blob/main/TTS/tts/utils/text/japanese/phonemizer.py | |
| # https://pypi.org/project/pykakasi/ | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(JapaneseTextPreprocessor, self).__init__("jp", "ja", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Japanese" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| def text_to_phonemes (self, line): | |
| kks = pykakasi.kakasi() | |
| line = kks.convert(line) | |
| line = " ".join([part["hira"] for part in line]) | |
| # print(f'line, {line}') | |
| return super(JapaneseTextPreprocessor, self).text_to_phonemes(line) | |
| class KoreanTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(KoreanTextPreprocessor, self).__init__("ko", "ko", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Korean" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class LatinTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(LatinTextPreprocessor, self).__init__("la", "la", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Latin" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class PolishTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(PolishTextPreprocessor, self).__init__("pl", "pl", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Polish" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class PortugueseTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(PortugueseTextPreprocessor, self).__init__("pt", "pt", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Portuguese" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class RussianTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(RussianTextPreprocessor, self).__init__("ru", "ru", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Russian" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class SpanishTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(SpanishTextPreprocessor, self).__init__("es", "es", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Spanish" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class SwahiliTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(SwahiliTextPreprocessor, self).__init__("sw", "sw", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Swahili" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class SwedishTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(SwedishTextPreprocessor, self).__init__("sv", "sv", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Swedish" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| # from thai_segmenter import sentence_segment | |
| class ThaiTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| # super(ThaiTextPreprocessor, self).__init__("th", "th", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| super(ThaiTextPreprocessor, self).__init__("th", "tha-Thai", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| # super(ThaiTextPreprocessor, self).__init__("th", "hau-Latn", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=True) | |
| self.lang_name = "Thai" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| def text_to_phonemes (self, line): | |
| final_line = line | |
| # try: | |
| # line = line.encode('utf8', errors='ignore').decode('utf8', errors='ignore') | |
| # sentence_parts = sentence_segment(line) | |
| # for part in list(sentence_parts): | |
| # for sub_part in part.pos: | |
| # final_line.append(sub_part[0]) | |
| # final_line.append(".") | |
| # final_line = " ".join(final_line) | |
| # except: | |
| # pass | |
| return super(ThaiTextPreprocessor, self).text_to_phonemes(final_line) | |
| class TurkishTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(TurkishTextPreprocessor, self).__init__("tr", "tr", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Turkish" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class UkrainianTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(UkrainianTextPreprocessor, self).__init__("uk", "uk", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Ukrainian" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class VietnameseTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(VietnameseTextPreprocessor, self).__init__("vi", "vi", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Vietnamese" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| # https://polyglotclub.com/wiki/Language/Wolof/Pronunciation/Alphabet-and-Pronunciation#:~:text=Wolof%20Alphabet,-VowelsEdit&text=Single%20vowels%20are%20short%2C%20geminated,British)%20English%20%22sawed%22. | |
| # https://huggingface.co/abdouaziiz/wav2vec2-xls-r-300m-wolof | |
| class WolofTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(WolofTextPreprocessor, self).__init__("wo", "wo", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=False) | |
| self.lang_name = "Wolof" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| # A very basic, lossy Wolof -> IPA converter. There were no g2p libraries supporting Wolof at the time of writing. It was this or nothing. | |
| def custom_g2p_fn(self, word): | |
| # print(f'custom_g2p_fn | IN: {word}') | |
| word = word.lower() | |
| # lossy | |
| word = word.replace("à", "a") | |
| word = word.replace("ó", "o") | |
| word = word.replace("aa", "aː") | |
| word = re.sub('a(?!:)', 'ɐ', word) | |
| word = word.replace("bb", "bː") | |
| word = word.replace("cc", "cːʰ") | |
| word = word.replace("dd", "dː") | |
| word = word.replace("ee", "ɛː") | |
| word = word.replace("ée", "eː") | |
| word = word.replace("ëe", "əː") | |
| word = re.sub('e(?!:)', 'ɛ', word) | |
| word = re.sub('ë(?!:)', 'ə', word) | |
| word = word.replace("gg", "gː") | |
| word = word.replace("ii", "iː") | |
| word = word.replace("jj", "ɟːʰ") | |
| word = re.sub('j(?!:)', 'ɟ', word) | |
| word = word.replace("kk", "kːʰ") | |
| word = word.replace("ll", "ɫː") | |
| word = word.replace("mb", "m̩b") | |
| word = word.replace("mm", "mː") | |
| word = word.replace("nc", "ɲc") | |
| word = word.replace("nd", "n̩d") | |
| word = word.replace("ng", "ŋ̩g") | |
| word = word.replace("nj", "ɲɟ") | |
| word = word.replace("nk", "ŋ̩k") | |
| word = word.replace("nn", "nː") | |
| word = word.replace("nq", "ɴq") | |
| word = word.replace("nt", "n̩t") | |
| word = word.replace("ññ", "ɲː") | |
| word = word.replace("ŋŋ", "ŋː") | |
| word = re.sub('ñ(?!:)', 'ɲ', word) | |
| word = word.replace("oo", "oː") | |
| word = word.replace("o", "ɔ") | |
| word = word.replace("pp", "pːʰ") | |
| word = word.replace("rr", "rː") | |
| word = word.replace("tt", "tːʰ") | |
| word = word.replace("uu", "uː") | |
| word = word.replace("ww", "wː") | |
| word = word.replace("yy", "jː") | |
| word = word.replace("y", "j") | |
| # lossy | |
| word = word.replace("é", "e") | |
| word = word.replace("ë", "e") | |
| word = word.replace("ñ", "n") | |
| word = word.replace("ŋ", "n") | |
| # print(f'custom_g2p_fn | OUT: {word}') | |
| return word | |
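# Illustrative example of the lossy mapping above (hypothetical word):
#   custom_g2p_fn("xale")  ->  "xɐlɛ"
# Single "a"/"e" vowels become ɐ/ɛ, doubled letters keep a ː length mark, and the resulting
# IPA-ish string is then run through ipa2xvaarpabet like any other g2p output.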
| # def save_g2p_cache(self): | |
| # # TEMPORARY | |
| # pass | |
| class YorubaTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(YorubaTextPreprocessor, self).__init__("yo", "yor-Latn", base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Yoruba" | |
| self.abbreviations = [ | |
| ] | |
| self.normalize_numbers = self.num2words_fn | |
| self.init_post() | |
| class ChineseTextPreprocessor(TextPreprocessor): | |
| def __init__(self, base_dir, logger=None, use_g2p=True, use_epitran=False): | |
| super(ChineseTextPreprocessor, self).__init__("zh", None, base_dir, logger=logger, use_g2p=use_g2p, use_epitran=use_epitran) | |
| self.lang_name = "Chinese" | |
| self.abbreviations = [ | |
| ] | |
| self.init_post() | |
| # self.g2p = None | |
| # if self.use_g2p: | |
| # self.g2p = G2pC() | |
| from g2pc import G2pC | |
| self.g2p = G2pC() | |
| self.TEMP_unhandled = [] | |
| def split_pinyin (self, pinyin): | |
| symbs_split = [] | |
| pinyin = pinyin.lower() | |
| splitting_symbs = ["zh", "ch", "sh", "b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "z", "c", "s", "r", "j", "q", "x"] | |
| for ss in splitting_symbs: | |
| # if phon.startswith(ss) and not phon.endswith("i"): | |
| if pinyin.startswith(ss): | |
| symbs_split.append(ss.upper()) | |
| pinyin = pinyin[len(ss):] | |
| break | |
| symbs_split.append(pinyin.upper()) | |
| return symbs_split | |
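# Illustrative examples: the splittable initial consonant is separated from the rest of the
# syllable, and both halves are upper-cased:
#   split_pinyin("zhong") -> ["ZH", "ONG"]
#   split_pinyin("an")    -> ["AN"]     # no splittable initial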
| def post_process_pinyin_symbs (self, symbs): | |
| post_processed = [] | |
| # splitting_symbs = ["zh", "ch", "sh", "b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "z", "c", "s", "r", "j", "q", "x"] | |
| for symb in symbs.split(" "): | |
| if len(symb)==0: | |
| continue | |
| symbs = self.split_pinyin(symb) | |
| for symb in symbs: | |
| post_processed.append(symb) | |
| # for ss in splitting_symbs: | |
| # # if phon.startswith(ss) and not phon.endswith("i"): | |
| # if symb.startswith(ss): | |
| # post_processed.append(ss.upper()) | |
| # symb = symb[len(ss):] | |
| # break | |
| # post_processed.append(symb.upper()) | |
| return " ".join(post_processed) | |
| def fill_missing_via_g2p_zh (self, text): | |
| # TODO, switch to from nltk.tokenize import word_tokenize | |
| orig_text = text | |
| # print(f'[g2p] orig_text, |{orig_text}|') | |
| text_parts = text.split("{") | |
| text_parts2 = [(part.split("}")[1] if "}" in part else part) for part in text_parts] | |
| # print(f'[g2p] text_parts, {text_parts}') | |
| # print(f'[g2p] text_parts2, {text_parts2}') | |
| phonemised = [] | |
| for part in text_parts2: | |
| # words = part.split(" ") | |
| words = [part] | |
| part_phonemes = [] | |
| for word in words: | |
| word = word.strip() | |
| if len(word): | |
| # print(f'[g2p] word, {word}') | |
| sub_parts = [] | |
| sub_part_phonemes = [] | |
| # ====== punctuation stuff start ======== | |
| # Get which punctuation symbols are contained in the text fragment | |
| puncs_contained = [] | |
| for punc in PUNCTUATION: | |
| if punc in word: | |
| puncs_contained.append(punc) | |
| # Split away the punctuation from text | |
| sub_parts = [word] | |
| # print(f'puncs_contained, {puncs_contained}') | |
| if len(puncs_contained): | |
| for punc in puncs_contained: | |
| # init a new sub part list (list 2) | |
| sub_parts2 = [] | |
| # for each sub part... | |
| for sp in sub_parts: | |
| sp = sp.strip() | |
| # ...if it not already a punctuation symbol, try splitting it by the current punctuation symbol | |
| if sp not in PUNCTUATION: | |
| sp_split = sp.split(punc) | |
| # if the split list length is 1, add to list 2 | |
| if len(sp_split)==1: | |
| sub_parts2.append(sp_split[0]) | |
| else: | |
| # if it's more than 1 | |
| # print(f'sp_split, {sp_split}') | |
| for spspi, sps_part in enumerate(sp_split): | |
| # iterate through each item, and add to list, but also add the punct, apart from the last item | |
| sub_parts2.append(sps_part) | |
| if spspi<(len(sp_split)-1): | |
| sub_parts2.append(punc) | |
| else: | |
| # otherwise add the punct to list 2 | |
| sub_parts2.append(sp) | |
| # set the sub parts list to list 2, for the next loop, or ready | |
| sub_parts = sub_parts2 | |
| else: | |
| sub_parts = [word] | |
| # ====== punctuation stuff end ======== | |
| # print(f'sub_parts, {sub_parts}') | |
| for sp in sub_parts: | |
| if sp in PUNCTUATION: | |
| sub_part_phonemes.append(sp) | |
| else: | |
| sp = sp.replace("\"", "").replace(")", "").replace("(", "").replace("]", "").replace("[", "").strip() | |
| if len(sp): | |
| if sp.lower() in self.g2p_cache.keys() and len(self.g2p_cache[sp.lower()].strip()): | |
| g2p_out = self.g2p_cache[sp.lower()] | |
| g2p_out = self.post_process_pinyin_symbs(g2p_out) | |
| sub_part_phonemes.append(g2p_out) | |
| else: | |
| # print(f'sp, {sp} ({len(self.g2p_cache.keys())})') | |
| # g2p_out = self.espeak.phonemize(sp).replace("|", " ") | |
| g2p_out = self.g2p(sp) | |
| g2p_out = " ".join([out_part[2] for out_part in g2p_out]) | |
| self.g2p_cache[sp.lower()] = g2p_out | |
| self.save_g2p_cache() | |
| # g2p_out = ipa2xvaarpabet(g2p_out) | |
| g2p_out = self.post_process_pinyin_symbs(g2p_out) | |
| # print(f'g2p_out, {g2p_out}') | |
| sub_part_phonemes.append(g2p_out) | |
| part_phonemes.append(" ".join(sub_part_phonemes)) | |
| phonemised.append(" _ ".join(part_phonemes)) | |
| # print("--") | |
| # print(f'text_parts ({len(text_parts)}), {text_parts}') | |
| # print(f'[g2p] phonemised ({len(phonemised)}), {phonemised}') | |
| text = [] | |
| for ppi, phon_part in enumerate(phonemised): | |
| # print(f'phon_part, {phon_part}') | |
| prefix = "" | |
| if "}" in text_parts[ppi]: | |
| if ppi<len(phonemised)-1 and text_parts[ppi].split("}")[1].startswith(" "): | |
| prefix = text_parts[ppi].split("}")[0]+" _ " | |
| else: | |
| prefix = text_parts[ppi].split("}")[0]+" " | |
| text.append(f'{prefix} {phon_part}') | |
| # print(f'[g2p] text ({len(text)}), {text}') | |
| text_final = [] | |
| for tpi, text_part in enumerate(text): | |
| if tpi!=0 or text_part.strip()!="" or not orig_text.startswith("{"): | |
| # print(not orig_text.startswith("{"), tpi, f'|{text_part.strip()}|') | |
| text_final.append(text_part) | |
| if (tpi or orig_text.startswith(" ")) and ((tpi<len(text_parts2)-1 and text_parts2[tpi+1].startswith(" ")) or text_parts2[tpi].endswith(" ")): | |
| # print("adding _") | |
| text_final.append("_") | |
| text = " ".join(text_final).replace(" ", " ").replace(" ", " ").replace(" _ _ ", " _ ").replace(" _ _ ", " _ ") | |
| return text | |
| def preprocess_pinyin (self, text): | |
| # self.logger.info(f'preprocess_pinyin word_tokenize: {word_tokenize(text)}') | |
| tokens = word_tokenize(text) | |
| final_out = [] | |
| is_inside_inline_arpabet = False # Used for determining whether to handle the token as graphemes or inlined phonemes (or already preprocessed phonemes) | |
| # has_included_inline_arpabet_start = False # Used to determine whether to insert the inline phoneme delimiter start { | |
| for token in tokens: | |
| if token.startswith("{"): | |
| is_inside_inline_arpabet = True | |
| # if len(token.replace("{", "")): | |
| # final_out.append(token.replace("{", "")) | |
| final_out.append(token) | |
| if token.endswith("}"): | |
| is_inside_inline_arpabet = False | |
| final_out.append(token) | |
| if is_inside_inline_arpabet: # The token is an already processed phoneme, from inline or previously processed phonemes. Include without changes | |
| final_out.append(token) | |
| continue | |
| text = text_pinyin_to_pinyin_symbs(token) | |
| text_final = [] | |
| text = text.upper().split(" ") | |
| # self.logger.info(f'preprocess_pinyin text: {text}') | |
| for part in text: | |
| # self.logger.info(f'preprocess_pinyin part: {part}') | |
| final_parts = [] | |
| # split_symbs = [] | |
| do_again = True | |
| # print(f'part, {part}') | |
| while do_again: | |
| # Check to see if the part is a pinyin syllable that starts with one of the consonants that can be split away | |
| split_symbs = self.split_pinyin(part) | |
| # print(f'split_symbs, {split_symbs}') | |
| do_again = False | |
| if len(split_symbs)>1: | |
| # A split happened. Add the first split-pinyin into the list... | |
| final_parts.append(split_symbs[0]) | |
| # ... then check if the second half of the split starts with one of the "ending" pinyin phonemes | |
| second_half = split_symbs[1] | |
| for phone in PIN_YIN_ENDS: | |
| if second_half.startswith(phone): | |
| final_parts.append(phone) | |
| second_half = second_half[len(phone):] | |
| if len(second_half): | |
| do_again = True | |
| break | |
| # Check to see if the leftover starts with one of the pinyin to arpabet mappings | |
| for phone_key in pinyin_to_arpabet_mappings.keys(): | |
| if second_half.startswith(phone_key): | |
| final_parts.append(pinyin_to_arpabet_mappings[phone_key]) | |
| second_half = second_half[len(pinyin_to_arpabet_mappings[phone_key]):] | |
| if len(second_half): | |
| do_again = True | |
| break | |
| part = second_half | |
| else: | |
| # If the part wasn't split up, then check if it starts with a "split" pinyin symbol, but not with the splitting consonants | |
| for phone in PIN_YIN_ENDS: | |
| if part.startswith(phone): | |
| # Starts with an "ending" phoneme, so add to the list and remove from the part | |
| final_parts.append(phone) | |
| part = part[len(phone):] | |
| if len(part): | |
| # Repeat the whole thing, if there's still any left-over stuff | |
| do_again = True | |
| break | |
| # Check to see if the leftover starts with one of the pinyin to arpabet mappings | |
| for phone_key in pinyin_to_arpabet_mappings.keys(): | |
| if part.startswith(phone_key): | |
| # Starts with a replacement phone, so add to the list and remove from the part | |
| final_parts.append(pinyin_to_arpabet_mappings[phone_key]) | |
| part = part[len(pinyin_to_arpabet_mappings[phone_key]):] | |
| if len(part): | |
| # Repeat the whole thing, if there's still any left-over stuff | |
| do_again = True | |
| break | |
| # print(f'part, {part}') | |
| if len(part): | |
| final_parts.append(part) | |
| # print(f'final_parts, {final_parts}') | |
| # self.logger.info(f'preprocess_pinyin final_parts: {final_parts}') | |
| all_split_are_pinyin = True | |
| final_parts_post = [] | |
| for split in final_parts: | |
| if split in pinyin_to_arpabet_mappings.keys(): | |
| # self.logger.info(f'preprocess_pinyin changing split from: {split} to {pinyin_to_arpabet_mappings[split]}') | |
| split = pinyin_to_arpabet_mappings[split] | |
| # if split=="J": | |
| # split = "JH" | |
| if split in ALL_SYMBOLS: | |
| final_parts_post.append(split) | |
| else: | |
| if split+"5" in ALL_SYMBOLS: | |
| final_parts_post.append(split+"5") | |
| else: | |
| all_split_are_pinyin = False | |
| # self.logger.info(f'preprocess_pinyin final_parts_post: {final_parts_post}') | |
| if all_split_are_pinyin: | |
| # text_final.append("{"+" ".join(final_parts)+"}") | |
| text_final.append("{"+" ".join(final_parts_post)+"}") | |
| else: | |
| text_final.append(part) | |
| # print(f'text_final, {text_final}') | |
| final_out.append(" ".join(text_final)) | |
| # self.logger.info(f'preprocess_pinyin final_out: {final_out}') | |
| text = " ".join(final_out) | |
| # self.logger.info(f'preprocess_pinyin return text: {text}') | |
| return text | |
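# Rough sketch of the intent above (illustrative, and assuming the relevant symbols exist in
# PIN_YIN_ENDS / pinyin_to_arpabet_mappings / ALL_SYMBOLS): a token that decomposes fully into
# known pinyin initials/finals, e.g. "ni3 hao3", is wrapped as an inline phoneme block such as
# "{N I3 H AO3}" (toneless finals get a neutral-tone "5" appended if only that form exists);
# tokens that don't decompose cleanly are left as graphemes for fill_missing_via_g2p_zh to handle.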
| def text_to_phonemes (self, text): | |
| # print(f'text_to_phonemes, {text}') | |
| text = self.collapse_whitespace(text).replace(" }", "}").replace("{ ", "{") | |
| text = self.preprocess_pinyin(text) | |
| # text = self.clean_numbers(text) | |
| # print(f'clean_numbers: |{text}|') | |
| # text = self.clean_abbreviations(text) | |
| # print(f'clean_abbreviations: |{text}|') | |
| # text = self.dict_replace(text, customDicts=True) | |
| # print(f'dict_replace(custom): |{text}|') | |
| # text = self.detect_and_fill_heteronyms(text) | |
| # print(f'detect_and_fill_heteronyms: |{text}|') | |
| # text = self.dict_replace(text, customDicts=False) | |
| # print(f'dict_replace(built-in):, |{text}|') | |
| # text = self.clean_POS_and_subword_misses(text) | |
| # self.logger.info(f'clean_POS_and_subword_misses: |{text}|') | |
| text = self.fill_missing_via_g2p_zh(text) | |
| # self.logger.info(f'1 text: {text}') | |
| # text = self.en_processor.text_to_phonemes(text) | |
| # self.logger.info(f'2 text: {text}') | |
| # print(f'fill_missing_via_g2p: |{text}|') | |
| return text | |
| def text_to_sequence (self, text): | |
| orig_text = text | |
| text = self.collapse_whitespace(text) # Get rid of duplicate/padding spaces | |
| text = text.replace("!", "!").replace("?", "?").replace(",", ",").replace("。", ",").replace("…", "...").replace(")", "").replace("(", "")\ | |
| .replace("、", ",").replace("“", ",").replace("”", ",").replace(":", ":") | |
| text = self.text_to_phonemes(text) # Get 100% phonemes from the text | |
| # if self.logger is not None: | |
| # self.logger.info(f'1 text: {text}') | |
| # text = self.en_processor.text_to_phonemes(text) | |
| # self.logger.info(f'2 text: {text}') | |
| phonemes = self.collapse_whitespace(text).strip().split(" ") | |
| # self.logger.info(f'1 phonemes: {phonemes}') | |
| sequence = [] | |
| for pi,phone in enumerate(phonemes): | |
| phone = phone.replace(":","").strip() | |
| if len(phone): | |
| try: | |
| sequence.append(ALL_SYMBOLS.index(phone)) | |
| except: | |
| if phone in pinyin_to_arpabet_mappings.keys(): | |
| sequence.append(ALL_SYMBOLS.index(pinyin_to_arpabet_mappings[phone])) | |
| else: | |
| if phone not in ["5"]: | |
| self.TEMP_unhandled.append(f'{orig_text}: {phone}') | |
| # with open(f'F:/Speech/xVA-Synth/python/xvapitch/text/DEBUG.txt', "w+") as f: | |
| # f.write("\n".join(self.TEMP_unhandled)) | |
| # Add a space character between each symbol | |
| # if pi is not len(phonemes)-1: | |
| # sequence.append(ALL_SYMBOLS.index("_")) | |
| # Intersperse blank symbol if required | |
| if self.add_blank: | |
| sequence_ = [] | |
| for si,symb in enumerate(sequence): | |
| sequence_.append(symb) | |
| if si<len(sequence)-1: | |
| sequence_.append(len(ALL_SYMBOLS)-1) | |
| sequence = sequence_ | |
| cleaned_text = "|".join([ALL_SYMBOLS[index] for index in sequence]) | |
| return sequence, cleaned_text | |
| def get_text_preprocessor(code, base_dir, logger=None, override_useAnyG2P=None): | |
| tp_codes = { | |
| "am": { | |
| "name": "Amharic", | |
| "tp": AmharicTextPreprocessor, | |
| "dicts": [], | |
| "custom_dicts": [], | |
| "use_g2p": False, | |
| "use_epitran": True, | |
| "g2p_cache": [f'{base_dir}/g2p_cache/epitran/epitran_cache_am.txt'] | |
| }, | |
| "ar": { | |
| "name": "Arabic", | |
| "tp": ArabicTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/arabic.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_ar.txt'] | |
| }, | |
| "da": { | |
| "name": "Danish", | |
| "tp": DanishTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/danish.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_da.txt'] | |
| }, | |
| "de": { | |
| "name": "German", | |
| "tp": GermanTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/german.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_de.txt'] | |
| }, | |
| "el": { | |
| "name": "Greek", | |
| "tp": GreekTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/greek.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_el.txt'] | |
| }, | |
| "en": { | |
| "name": "English", | |
| "tp": EnglishTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/cmudict.txt'], | |
| "custom_dicts": glob.glob(f'{base_dir}/../../../arpabet/*.json'), | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_en.txt'] | |
| }, | |
| "es": { | |
| "name": "Spanish", | |
| "tp": SpanishTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/spanish.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_es.txt'] | |
| }, | |
| "fi": { | |
| "name": "Finnish", | |
| "tp": FinnishTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/finnish.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_fi.txt'] | |
| }, | |
| "fr": { | |
| "name": "French", | |
| "tp": FrenchTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/french.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_fr.txt'] | |
| }, | |
| "ha": { | |
| "name": "Hausa", | |
| "tp": HausaTextPreprocessor, | |
| # "dicts": [f'{base_dir}/dicts/hausa.txt'], | |
| "dicts": [], | |
| "custom_dicts": [], | |
| "use_g2p": False, | |
| "use_epitran": True, | |
| "g2p_cache": [f'{base_dir}/g2p_cache/epitran/epitran_cache_ha.txt'] | |
| }, | |
| "hi": { | |
| "name": "Hindi", | |
| "tp": HindiTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/hindi.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_hi.txt'] | |
| }, | |
| "hu": { | |
| "name": "Hungarian", | |
| "tp": HungarianTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/hungarian.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_hu.txt'] | |
| }, | |
| "it": { | |
| "name": "Italian", | |
| "tp": ItalianTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/italian.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_it.txt'] | |
| }, | |
| "jp": { | |
| "name": "Japanese", | |
| "tp": JapaneseTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/japanese.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_jp.txt'] | |
| }, | |
| "ko": { | |
| "name": "Korean", | |
| "tp": KoreanTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/korean.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_ko.txt'] | |
| }, | |
| "la": { | |
| "name": "Latin", | |
| "tp": LatinTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/latin.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_la.txt'] | |
| }, | |
| "mn": { | |
| "name": "Mongolian", | |
| "tp": MongolianTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/mongolian.txt'], | |
| "custom_dicts": [], | |
| "use_epitran": True, | |
| "g2p_cache": [f'{base_dir}/g2p_cache/epitran/epitran_cache_mn.txt'] | |
| }, | |
| "nl": { | |
| "name": "Dutch", | |
| "tp": DutchTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/dutch.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_nl.txt'] | |
| }, | |
| "pl": { | |
| "name": "Polish", | |
| "tp": PolishTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/polish.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_pl.txt'] | |
| }, | |
| "pt": { | |
| "name": "Portuguese", | |
| "tp": PortugueseTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/portuguese_br.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_pt.txt'] | |
| }, | |
| "ro": { | |
| "name": "Romanian", | |
| "tp": RomanianTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/romanian.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_ro.txt'] | |
| }, | |
| "ru": { | |
| "name": "Russian", | |
| "tp": RussianTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/russian.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_ru.txt'] | |
| }, | |
| "sv": { | |
| "name": "Swedish", | |
| "tp": SwedishTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/swedish.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_sv.txt'] | |
| }, | |
| "sw": { | |
| "name": "Swahili", | |
| "tp": SwahiliTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/swahili.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_sw.txt'] | |
| }, | |
| "th": { | |
| "name": "Thai", | |
| "tp": ThaiTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/thai.txt'], | |
| "custom_dicts": [], | |
| # "use_g2p": F | |
| # "use_g2p": False, | |
| # "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_th.txt'] | |
| "g2p_cache": [f'{base_dir}/g2p_cache/epitran/epitran_cache_th.txt'] | |
| }, | |
| "tr": { | |
| "name": "Turkish", | |
| "tp": TurkishTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/turkish.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_tr.txt'] | |
| }, | |
| "uk": { | |
| "name": "Ukrainian", | |
| "tp": UkrainianTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/ukrainian.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_uk.txt'] | |
| }, | |
| "vi": { | |
| "name": "Vietnamese", | |
| "tp": VietnameseTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/vietnamese.txt'], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/espeak/espeak_cache_vi.txt'] | |
| }, | |
| "wo": { | |
| "name": "Wolof", | |
| "tp": WolofTextPreprocessor, | |
| # "dicts": [f'{base_dir}/dicts/wolof.txt'], | |
| "dicts": [], | |
| "custom_dicts": [], | |
| "g2p_cache": [f'{base_dir}/g2p_cache/g2p_cache_wo.txt'] | |
| }, | |
| "yo": { | |
| "name": "Yoruba", | |
| "tp": YorubaTextPreprocessor, | |
| "dicts": [f'{base_dir}/dicts/yoruba.txt'], | |
| "custom_dicts": [], | |
| "use_epitran": True, | |
| "g2p_cache": [f'{base_dir}/g2p_cache/epitran/epitran_cache_yo.txt'] | |
| }, | |
| "zh": { | |
| "name": "Chinese", | |
| "tp": ChineseTextPreprocessor, | |
| "dicts": [], | |
| "custom_dicts": [], | |
| # "use_g2p": False, | |
| "use_g2p": True, | |
| "g2p_cache": [f'{base_dir}/g2p_cache/g2pc_cache_zh.txt'] | |
| }, | |
| } | |
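| # Notes on the optional per-language keys, as consumed below: "use_g2p" falls back to True when absent, | |
| # "use_epitran" falls back to None, and only the first "g2p_cache" path is actually loaded. | |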
| use_g2p = tp_codes[code]["use_g2p"] if "use_g2p" in tp_codes[code].keys() else True | |
| # print(f'override_useAnyG2P, {override_useAnyG2P}') | |
| if override_useAnyG2P is False: | |
| use_g2p = override_useAnyG2P | |
| tp_codes[code]["use_epitran"] = override_useAnyG2P | |
| tp_codes[code]["use_g2p"] = override_useAnyG2P | |
| # print(f'tp_codes[code]["use_epitran"], {tp_codes[code]["use_epitran"]}') | |
| tp = tp_codes[code]["tp"](base_dir, logger=logger, use_g2p=use_g2p, use_epitran=tp_codes[code]["use_epitran"] if "use_epitran" in tp_codes[code].keys() else None) | |
| for builtin_dict in tp_codes[code]["dicts"]: | |
| tp.load_dict(builtin_dict) | |
| for custom_dict in tp_codes[code]["custom_dicts"]: | |
| tp.load_dict(custom_dict, isCustom=True) | |
| if len(tp_codes[code]["g2p_cache"]): | |
| tp.load_g2p_cache(tp_codes[code]["g2p_cache"][0]) | |
| return tp | |
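| # Minimal usage sketch (language code and sample text are illustrative only; mirrors the calls further below): | |
| #   tp = get_text_preprocessor("de", base_dir) | |
| #   print(tp.text_to_phonemes("Ein kurzer Testsatz.")) | |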
| if __name__ == '__main__': | |
| import os | |
| base_dir = "/".join(os.path.abspath(__file__).split("\\")[:-1]) | |
| # tp = RomanianTextPreprocessor(base_dir) | |
| # tp = ItalianTextPreprocessor(base_dir) | |
| # tp = GermanTextPreprocessor(base_dir) | |
| # tp = FrenchTextPreprocessor(base_dir) | |
| # tp = ArabicTextPreprocessor(base_dir) | |
| tp = get_text_preprocessor("jp", base_dir) | |
| # line = "Un test la 10 cuvinte" | |
| # line = "ein Testsatz mit 10 Wörtern" | |
| # line = "une phrase test de 10 mots" | |
| # line = "جملة اختبارية من 10 كلمات" | |
| # line = "かな漢字" | |
| # line = "10語の日本語文" | |
| # line = "aa a a " | |
| # line = "aa a baal rebb ceeb sàcc " | |
| line = "これしきで戦闘不能か…ひ弱なものだな。" | |
| # line = "これしきで せんとうふのう か…ひ じゃく なものだな。" | |
| line = "これ式で戦闘不能か費はなものだな." | |
| line = "これ しき で せんとうふのう か ひ はなものだな." | |
| # tp.espeak | |
| # print(f'tp.espeak, {tp.espeak}') | |
| # print(f'tp.espeak, {tp.espeak.supported_languages(base_dir)}') | |
| # # {'af': 'afrikaans-mbrola-1', 'am': 'Amharic', 'an': 'Aragonese', 'ar': 'Arabic', 'as': 'Assamese', 'az': 'Azerbaijani', 'ba': 'Bashkir', 'be': 'Belarusian', 'bg': 'Bulgarian', 'bn': 'Bengali', 'bpy': 'Bishnupriya_Manipuri', 'bs': 'Bosnian', 'ca': 'Catalan', 'chr-US-Qaaa-x-west': 'Cherokee_', 'cmn': 'Chinese_(Mandarin,_latin_as_English)', 'cmn-latn-pinyin': 'Chinese_(Mandarin,_latin_as_Pinyin)', 'cs': 'Czech', 'cv': 'Chuvash', 'cy': 'Welsh', 'da': 'Danish', 'de': 'german-mbrola-8', 'el': 'greek-mbrola-1', 'en': 'en-swedish', 'en-029': 'English_(Caribbean)', 'en-gb': 'English_(Great_Britain)', 'en-gb-scotland': 'English_(Scotland)', 'en-gb-x-gbclan': 'English_(Lancaster)', 'en-gb-x-gbcwmd': 'English_(West_Midlands)', 'en-gb-x-rp': 'English_(Received_Pronunciation)', 'en-uk': 'english-mb-en1', 'en-us': 'us-mbrola-3', 'en-us-nyc': 'English_(America,_New_York_City)', 'eo': 'Esperanto', 'es': 'Spanish_(Spain)', 'es-419': 'Spanish_(Latin_America)', 'es-es': 'spanish-mbrola-2', 'es-mx': 'mexican-mbrola-2', 'es-vz': 'venezuala-mbrola-1', 'et': 'estonian-mbrola-1', 'eu': 'Basque', 'fa': 'persian-mb-ir1', 'fa-latn': 'Persian_(Pinglish)', 'fi': 'Finnish', 'fr': 'french-mbrola-7', 'fr-be': 'french-mbrola-5', 'fr-ca': 'fr-canadian-mbrola-2', 'fr-ch': 'French_(Switzerland)', 'fr-fr': 'french-mbrola-6', 'ga': 'Gaelic_(Irish)', 'gd': 'Gaelic_(Scottish)', 'gn': 'Guarani', 'grc': 'german-mbrola-6', 'gu': 'Gujarati', 'hak': 'Hakka_Chinese', 'haw': 'Hawaiian', 'he': 'hebrew-mbrola-2', 'hi': 'Hindi', 'hr': 'croatian-mbrola-1', 'ht': 'Haitian_Creole', 'hu': 'hungarian-mbrola-1', 'hy': 'Armenian_(East_Armenia)', 'hyw': 'Armenian_(West_Armenia)', 'ia': 'Interlingua', 'id': 'indonesian-mbrola-1', 'io': 'Ido', 'is': 'icelandic-mbrola-1', 'it': 'italian-mbrola-2', 'ja': 'Japanese', 'jbo': 'Lojban', 'ka': 'Georgian', 'kk': 'Kazakh', 'kl': 'Greenlandic', 'kn': 'Kannada', 'ko': 'Korean', 'kok': 'Konkani', 'ku': 'Kurdish', 'ky': 'Kyrgyz', 'la': 'latin-mbrola-1', 'lb': 'Luxembourgish', 'lfn': 'Lingua_Franca_Nova', 'lt': 'lithuanian-mbrola-2', 'ltg': 'Latgalian', 'lv': 'Latvian', 'mi': 'maori-mbrola-1', 'mk': 'Macedonian', 'ml': 'Malayalam', 'mr': 'Marathi', 'ms': 'Malay', 'mt': 'Maltese', 'my': 'Myanmar_(Burmese)', 'nb': 'Norwegian_Bokmål', 'nci': 'Nahuatl_(Classical)', 'ne': 'Nepali', 'nl': 'dutch-mbrola-3', 'nog': 'Nogai', 'om': 'Oromo', 'or': 'Oriya', 'pa': 'Punjabi', 'pap': 'Papiamento', 'piqd': 'Klingon', 'pl': 'polish-mbrola-1', 'pt': 'Portuguese_(Portugal)', 'pt-br': 'brazil-mbrola-4', 'pt-pt': 'portugal-mbrola-1', 'py': 'Pyash', 'qdb': 'Lang_Belta', 'qu': 'Quechua', 'quc': "K'iche'", 'qya': 'Quenya', 'ro': 'romanian-mbrola-1', 'ru': 'Russian', 'ru-lv': 'Russian_(Latvia)', 'sd': 'Sindhi', 'shn': 'Shan_(Tai_Yai)', 'si': 'Sinhala', 'sjn': 'Sindarin', 'sk': 'Slovak', 'sl': 'Slovenian', 'smj': 'Lule_Saami', 'sq': 'Albanian', 'sr': 'Serbian', 'sv': 'swedish-mbrola-2', 'sw': 'Swahili', 'ta': 'Tamil', 'te': 'telugu-mbrola-1', 'th': 'Thai', 'tk': 'Turkmen', 'tn': 'Setswana', 'tr': 'turkish-mbrola-1', 'tt': 'Tatar', 'ug': 'Uyghur', 'uk': 'Ukrainian', 'ur': 'Urdu', 'uz': 'Uzbek', 'vi': 'Vietnamese_(Northern)', 'vi-vn-x-central': 'Vietnamese_(Central)', 'vi-vn-x-south': 'Vietnamese_(Southern)', 'yue': 'Chinese_(Cantonese,_latin_as_Jyutping)', 'zh': 'chinese-mb-cn1'} | |
| # kks = pykakasi.kakasi() | |
| # line = kks.convert(line) | |
| # line = " ".join([part["hira"] for part in line]) | |
| print(f'line, {line}') | |
| print(f'Line: |{line}|') | |
| phonemes = tp.text_to_phonemes(line) | |
| print(f'xVAARPAbet: |{phonemes}|') | |
| sys.exit()  # intentional early exit (was an undefined-name crash hack) | |
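| # Alternate scratch entry point (unreachable past the exit above): batch-convert a word list to xVAARPAbet | |
| # and write a synthesis metadata CSV plus a word|phonemes CSV. | |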
| if __name__ == '__main__': | |
| base_dir = "/".join(os.path.abspath(__file__).split("\\")[:-1]) | |
| tp = get_text_preprocessor("en", base_dir) | |
| with open("F:/Speech/custom-arpabets/elderscrolls-missing-post.txt") as f: | |
| words = f.read().split("\n") | |
| metadata_out = ["game_id|voice_id|text,out_path"] | |
| txt_out = [] | |
| for word in words: | |
| if len(word.strip())>2: | |
| phones = tp.text_to_phonemes(word) | |
| print(f'word, {word}') | |
| print(f'phones, {phones}') | |
| metadata_out.append(f'skyrim|sk_femaleeventoned|This is what {{{phones}}} sounds like.|./{word}.wav') | |
| txt_out.append(f'{word}|{phones}') | |
| with open(f'./g2p_batch.csv', "w+") as f: | |
| f.write("\n".join(metadata_out)) | |
| with open(f'./txt_out.csv', "w+") as f: | |
| f.write("\n".join(txt_out)) | |
| sys.exit()  # intentional early exit | |
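| # The two blocks below are further scratch entry points: quick phonemization spot-checks for Wolof | |
| # (the Thai and Mongolian variants are commented out) and then for Hausa. | |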
| if __name__ == '__main__': | |
| base_dir = "/".join(os.path.abspath(__file__).split("\\")[:-1]) | |
| # tp = get_text_preprocessor("th", base_dir) | |
| # tp = get_text_preprocessor("mn", base_dir) | |
| tp = get_text_preprocessor("wo", base_dir) | |
| # print(tp.text_to_phonemes("นี่คือประโยคภาษาไทยที่พูดโดย xVASynth ประมาณนี้ค่ะ")) | |
| # print(tp.text_to_phonemes("Энэ бол {EH1 G S V EY0 EY0 IH0 S IH0 N TH}-ийн ярьдаг монгол хэл дээрх өгүүлбэр юм. ")) | |
| print(tp.text_to_phonemes(" Kii est ab baat ci wolof, janga par xvasynth ")) | |
| sys.exit()  # intentional early exit | |
| if __name__ == '__main__': | |
| base_dir = "/".join(os.path.abspath(__file__).split("\\")[:-1]) | |
| tp = get_text_preprocessor("ha", base_dir) | |
| print(tp.text_to_phonemes("Wannan jimla ce a cikin hausa, xVASynth ta yi magana ")) | |
| sys.exit()  # intentional early exit | |
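| # Disabled bulk utility: pre-populates the g2p caches by running every dataset's metadata text through | |
| # text_to_phonemes for each language folder found on disk. | |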
| # if __name__ == '__main__': | |
| if False: | |
| print("Mass pre-caching g2p") | |
| def get_datasets(root_f): | |
| data_folders = os.listdir(root_f) | |
| data_folders = [f'{root_f}/{dataset_folder}' for dataset_folder in sorted(data_folders) if not dataset_folder.startswith("_") and "." not in dataset_folder] | |
| return data_folders | |
| base_dir = "/".join(os.path.abspath(__file__).split("\\")[:-1]) | |
| # all_data_folders = get_datasets(f'D:/xVASpeech/DATASETS')+get_datasets(f'D:/xVASpeech/GAME_DATA') | |
| all_data_folders = get_datasets(f'D:/xVASpeech/GAME_DATA') | |
| for dfi,dataset_folder in enumerate(all_data_folders): | |
| lang = dataset_folder.split("/")[-1].split("_")[0] | |
| if "de_f4" in dataset_folder: | |
| continue | |
| # if lang not in ["zh"]: | |
| # continue | |
| # if lang in ["am", "sw"]: | |
| # continue # Skip currently running training | |
| tp = get_text_preprocessor(lang, base_dir) | |
| with open(f'{dataset_folder}/metadata.csv') as f: | |
| lines = f.read().split("\n") | |
| for li,line in enumerate(lines): | |
| print(f'\r{dfi+1}/{len(all_data_folders)} | {li+1}/{len(lines)} | {dataset_folder} ', end="", flush=True) | |
| if "|" in line: | |
| text = line.split("|")[1] | |
| if len(text): | |
| tp.text_to_phonemes(text) | |
| print("") | |
| sys.exit()  # intentional early exit | |
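| # The commented block below was a one-off conversion of the Japanese dict keys to hiragana via pykakasi. | |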
| # kks = pykakasi.kakasi() | |
| # pron_dict = {} | |
| # # with open(f'F:/Speech/xva-trainer/python/xvapitch/text_prep/dicts/japanese.txt') as f: | |
| # with open(f'F:/Speech/xVA-Synth/python/xvapitch/text/dicts/japanese.txt') as f: | |
| # lines = f.read().split("\n") | |
| # for li,line in enumerate(lines): | |
| # print(f'\r{li+1}/{len(lines)}', end="", flush=True) | |
| # if len(line.strip()): | |
| # word = line.split(" ")[0] | |
| # phon = " ".join(line.split(" ")[1:]) | |
| # word = kks.convert(word) | |
| # word = "".join([part["hira"] for part in word]) | |
| # # word = word.replace(" ", "").replace(" ", "") | |
| # pron_dict[word] = phon | |
| # csv_out = [] | |
| # for key in pron_dict.keys(): | |
| # csv_out.append(f'{key} {pron_dict[key]}') | |
| # with open(f'F:/Speech/xva-trainer/python/xvapitch/text_prep/dicts/japanese_h.txt', "w+") as f: | |
| # f.write("\n".join(csv_out)) | |
| if False: | |
| tp = ChineseTextPreprocessor(base_dir) | |
| # tp.load_g2p_cache(f'F:/Speech/xva-trainer/python/xvapitch/text_prep/g2p_cache/g2pc_cache_zh.txt') | |
| line = "你好。 这就是 xVASynth 声音的样子。" | |
| line = "遛弯儿都得躲远点。" | |
| # line = "Nǐ hǎo" | |
| # line = "Zhè shì yīgè jiào zhǎng de jùzi. Wǒ xīwàng tā shì zhèngquè de, yīnwèi wǒ zhèngzài shǐyòng gǔgē fānyì tā" | |
| # phones = tp.text_to_phonemes(line) | |
| # print(f'phones, |{phones}|') | |
| phones = tp.text_to_sequence(line) | |
| print(f'phones, |{phones[1]}|') | |
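| # Scratch English harness: builds a few test lines (custom-dict words such as "dovahkiin"/"oxenfurt" and inline | |
| # {ARPAbet} overrides), loads CMUdict plus a custom dict, and prints the resulting xVAARPAbet. | |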
| print("start setup...") | |
| text = [] | |
| # text.append("nords") | |
| # text.append("I read the book... It was a good book to read?{T EH S T}! Test dovahkiin word") | |
| # text.append(" I read the book... It was a good book to read?{T EH S T}! Test dovahkiin word") | |
| # text.append("{AY1 } read the book... It was a good book to read?{T EH S T}! Test 1 dovahkiin word") | |
| text.append(" {AY1 } read the book... It was a good book to read?{T EH S T}! Test 1 dovahkiin word ") | |
| # text.append("the scaffold hung with black; and the inhabitants of the neighborhood, having petitioned the sheriffs to remove the scene of execution to the old place,") | |
| text.append("oxenfurt") | |
| text.append("atomatoys") | |
| import os | |
| base_dir = "/".join(os.path.abspath(__file__).split("\\")[:-1]) | |
| print(f'base_dir, {base_dir}') | |
| tp = EnglishTextPreprocessor(base_dir) | |
| tp.load_dict(f'F:/Speech/xva-trainer/python/xvapitch/text_prep/dicts/cmudict.txt') | |
| tp.load_dict(f'F:/Speech/xVA-Synth/arpabet/xvadict-elder_scrolls.json', isCustom=True) | |
| # tp.load_g2p_cache(f'F:/Speech/xva-trainer/python/xvapitch/text_prep/g2p_cache/espeak/espeak_cache_en.txt') | |
| print("start inferring...") | |
| for line in text: | |
| print(f'Line: |{line}|') | |
| phonemes = tp.text_to_phonemes(line) | |
| print(f'xVAARPAbet: |{phonemes}|') | |
| # TODO | |
| # - Add the POS, and extra cleaning stuff | |