# File: maskgct/models/tts/valle_v2/g2p_processor.py
# (Hugging Face file-viewer header: uploaded by Hecheng0625, "Upload 409 files",
# commit c968fc3, 6.66 kB.)
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import json
import numpy as np
import os
import torch
import copy
from g2p_en import G2p
import re
import unicodedata
from g2p_en import G2p
from g2p_en.expand import normalize_numbers
# Module-level G2p instance, constructed as a side effect of importing this
# module. The functions in this file do not read it: process() takes its
# converter as a parameter, while test() and G2pProcessor build their own.
# NOTE(review): appears unused here -- confirm no external importer relies on
# it before removing.
g2p = G2p()
# Phone vocabulary. A phone's position in this list is its integer id:
# PHPONE2ID is built from the positions and G2pProcessor.phoneid2txt indexes
# back into the list, so reordering, inserting or removing entries changes
# every id that follows.
#
# Besides punctuation, "|" (word boundary) and the <BOS>/<EOS>/<PAD>/<UNK>
# special tokens, entries are a phone symbol followed by the position suffix
# that process() appends: "B" for the first phone of a word, "I" for an
# interior phone and "E" for the last phone.
#
# NOTE(review): ".B", "AO1" and "L" do not follow that suffix scheme, and
# "UH2B" sits out of sorted order -- presumably historical; confirm before
# touching, since any change shifts ids.
PHONE_SET = [
"!",
",",
".",
".B",
":",
"<BOS>",
"<EOS>",
"<PAD>",
"<UNK>",
"?",
"AA0B",
"AA0E",
"AA0I",
"AA1B",
"AA1E",
"AA1I",
"AA2B",
"AA2E",
"AA2I",
"AE0B",
"AE0E",
"AE0I",
"AE1B",
"AE1E",
"AE1I",
"AE2B",
"AE2E",
"AE2I",
"AH0B",
"AH0E",
"AH0I",
"AH1B",
"AH1E",
"AH1I",
"AH2B",
"AH2E",
"AH2I",
"AO0B",
"AO0E",
"AO0I",
"AO1",
"AO1B",
"AO1E",
"AO1I",
"AO2B",
"AO2E",
"AO2I",
"AW0B",
"AW0E",
"AW0I",
"AW1B",
"AW1E",
"AW1I",
"AW2B",
"AW2E",
"AW2I",
"AY0B",
"AY0E",
"AY0I",
"AY1B",
"AY1E",
"AY1I",
"AY2B",
"AY2E",
"AY2I",
"BB",
"BE",
"BI",
"CHB",
"CHE",
"CHI",
"DB",
"DE",
"DHB",
"DHE",
"DHI",
"DI",
"EH0B",
"EH0E",
"EH0I",
"EH1B",
"EH1E",
"EH1I",
"EH2B",
"EH2E",
"EH2I",
"ER0B",
"ER0E",
"ER0I",
"ER1B",
"ER1E",
"ER1I",
"ER2B",
"ER2E",
"ER2I",
"EY0B",
"EY0E",
"EY0I",
"EY1B",
"EY1E",
"EY1I",
"EY2B",
"EY2E",
"EY2I",
"FB",
"FE",
"FI",
"GB",
"GE",
"GI",
"HHB",
"HHE",
"HHI",
"IH0B",
"IH0E",
"IH0I",
"IH1B",
"IH1E",
"IH1I",
"IH2B",
"IH2E",
"IH2I",
"IY0B",
"IY0E",
"IY0I",
"IY1B",
"IY1E",
"IY1I",
"IY2B",
"IY2E",
"IY2I",
"JHB",
"JHE",
"JHI",
"KB",
"KE",
"KI",
"L",
"LB",
"LE",
"LI",
"MB",
"ME",
"MI",
"NB",
"NE",
"NGB",
"NGE",
"NGI",
"NI",
"OW0B",
"OW0E",
"OW0I",
"OW1B",
"OW1E",
"OW1I",
"OW2B",
"OW2E",
"OW2I",
"OY0B",
"OY0E",
"OY0I",
"OY1B",
"OY1E",
"OY1I",
"OY2B",
"OY2E",
"OY2I",
"PB",
"PE",
"PI",
"RB",
"RE",
"RI",
"SB",
"SE",
"SHB",
"SHE",
"SHI",
"SI",
"TB",
"TE",
"THB",
"THE",
"THI",
"TI",
"UH0B",
"UH0E",
"UH0I",
"UH1B",
"UH2B",
"UH1E",
"UH1I",
"UH2E",
"UH2I",
"UW0B",
"UW0E",
"UW0I",
"UW1B",
"UW1E",
"UW1I",
"UW2B",
"UW2E",
"UW2I",
"VB",
"VE",
"VI",
"WB",
"WE",
"WI",
"YB",
"YE",
"YI",
"ZB",
"ZE",
"ZHB",
"ZHE",
"ZHI",
"ZI",
"|",
]
# Phone symbol -> integer id; the id is the symbol's position in PHONE_SET.
PHPONE2ID = {phone: idx for idx, phone in enumerate(PHONE_SET)}
# Punctuation characters that preprocess_text recognizes (and later blanks out).
PUNCS = "!,.?;:"
def is_sil_phoneme(p):
    """Tell whether ``p`` is a silence / non-speech token.

    Args:
        p: A word or phone string.

    Returns:
        True when ``p`` is empty or its first character is not alphabetic
        (e.g. punctuation, ``"|"``, ``"<BOS>"``); False otherwise.
    """
    if p == "":
        return True
    return not p[0].isalpha()
def add_bdr(txt_struct):
    """Insert a ``"|"`` boundary entry between adjacent non-silence words.

    Args:
        txt_struct: Sequence of ``[word, phones]`` entries.

    Returns:
        A new list holding the same entries (not copies), with
        ``["|", ["|"]]`` inserted after every entry whose own word and the
        following entry's word are both non-silence per ``is_sil_phoneme``.
    """
    with_boundaries = []
    last_index = len(txt_struct) - 1
    for idx, entry in enumerate(txt_struct):
        with_boundaries.append(entry)
        if idx == last_index:
            # Nothing follows the final entry, so no boundary is needed.
            continue
        if is_sil_phoneme(entry[0]) or is_sil_phoneme(txt_struct[idx + 1][0]):
            continue
        with_boundaries.append(["|", ["|"]])
    return with_boundaries
def preprocess_text(text):
    """Normalize raw English text into lowercase words separated by spaces.

    Steps, in order: spell out numbers with ``normalize_numbers``, strip
    accents, lowercase, drop quotes/parentheses, turn hyphens into spaces,
    remove every character that is not a space, ``a``-``z`` or in ``PUNCS``,
    tidy punctuation, rewrite "i.e." / "etc.", and finally replace all
    remaining punctuation with spaces and collapse whitespace runs.

    Args:
        text: Input string.

    Returns:
        The normalized string. It can still carry a leading or trailing
        space (e.g. where punctuation ended the input); callers strip it.
    """
    text = normalize_numbers(text)
    # Decompose to NFD and drop combining marks ("Mn") to strip accents.
    text = "".join(
        char
        for char in unicodedata.normalize("NFD", text)
        if unicodedata.category(char) != "Mn"
    )
    text = text.lower()
    text = re.sub("['\"()]+", "", text)
    text = re.sub("[-]+", " ", text)
    text = re.sub(f"[^ a-z{PUNCS}]", "", text)
    # Remove the optional space on either side of a punctuation mark: "a , b" -> "a,b".
    text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)
    # Collapse a run of punctuation to its last character: "!!" -> "!".
    text = re.sub(f"([{PUNCS}])+", r"\1", text)
    # The space after an abbreviation was removed two steps above, so
    # "i.e. the" is "i.e.the" at this point. The replacements carry a trailing
    # space to keep the next word separate ("that is the", not "that isthe");
    # surplus spaces are collapsed below.
    text = text.replace("i.e.", "that is ")
    text = text.replace("etc.", "etc ")
    text = re.sub(f"([{PUNCS}])", " ", text)  # remove punctuation for now
    text = re.sub(r"\s+", " ", text)
    return text
def postprocess(txt_struct):
    """Trim edge silences, add word boundaries and wrap with BOS/EOS entries.

    Args:
        txt_struct: Sequence of ``[word, phones]`` entries.

    Returns:
        A new list: ``["<BOS>", ["<BOS>"]]``, then the entries left after
        dropping leading and trailing silence words and running ``add_bdr``,
        then ``["<EOS>", ["<EOS>"]]``.
    """
    start, stop = 0, len(txt_struct)
    # Skip silence entries at the front ...
    while start < stop and is_sil_phoneme(txt_struct[start][0]):
        start += 1
    # ... and at the back.
    while stop > start and is_sil_phoneme(txt_struct[stop - 1][0]):
        stop -= 1
    body = add_bdr(txt_struct[start:stop])
    return [["<BOS>", ["<BOS>"]], *body, ["<EOS>", ["<EOS>"]]]
def process(txt, g2p):
    """Convert raw text into a word-level structure of position-tagged phones.

    The text is cleaned with ``preprocess_text``, phonemized by ``g2p``, and
    the phones are grouped per word. Within each non-silence word the first
    phone gets the suffix ``"B"``, interior phones ``"I"`` and the last phone
    ``"E"``; a single-phone word gets ``"B"`` only. The result is then passed
    through ``postprocess``.

    Args:
        txt: Raw input text.
        g2p: Callable mapping the cleaned text to a flat sequence of phone
            strings in which ``" "`` separates words.

    Returns:
        A tuple ``(txt_struct, txt)``: the list of ``[word, phones]`` entries
        returned by ``postprocess`` and the cleaned, stripped text. A word for
        which ``g2p`` produced no phones is omitted from ``txt_struct`` (it
        still appears in ``txt``).
    """
    txt = preprocess_text(txt).strip()
    phs = g2p(txt)

    # One [word, phones] slot per space-separated word of the cleaned text.
    txt_struct = [[w, []] for w in txt.split(" ")]
    # NOTE(review): assumes g2p emits exactly one " " between consecutive
    # words of txt; if its tokenizer splits words differently the indexing
    # below raises IndexError -- confirm against the g2p implementation.
    i_word = 0
    for p in phs:
        if p == " ":
            i_word += 1
        else:
            txt_struct[i_word][1].append(p)

    txt_struct_ret = []
    for word, phones in txt_struct:
        if is_sil_phoneme(word):
            # Silence tokens keep their phones untagged.
            txt_struct_ret.append([word, list(phones)])
            continue
        if not phones:
            # No phones for this word: tagging would index an empty list,
            # and keeping the entry would only add an empty boundary.
            continue
        tagged = [ph + "I" for ph in phones]
        tagged[0] = phones[0] + "B"
        if len(phones) > 1:
            tagged[-1] = phones[-1] + "E"
        txt_struct_ret.append([word, tagged])

    txt_struct_ret = postprocess(txt_struct_ret)
    return txt_struct_ret, txt
def test():
    """Smoke test: print each stage of phonemizing one fixed sentence.

    Prints the word/phone structure, the cleaned text, the flat phone
    sequence and the corresponding ids from ``PHPONE2ID``.
    """
    converter = G2p()
    structure, cleaned = process("This is a test sentence.", converter)
    print(structure)
    print(cleaned)

    phones = []
    for entry in structure:
        phones.extend(entry[1])
    print(phones)

    ids = [PHPONE2ID[phone] for phone in phones]
    print(ids)
class G2pProcessor:
    """Text <-> phone-id converter built on a ``G2p`` instance."""

    def __init__(self):
        """Create the underlying ``G2p`` converter."""
        self.g2p = G2p()

    def __call__(self, txt, lang="en"):
        """Shorthand for ``txt2phoneid(txt)``; ``lang`` is accepted but not used."""
        return self.txt2phoneid(txt)

    def txt2phoneid(self, txt):
        """Phonemize ``txt`` and map every phone to its id.

        Args:
            txt: Raw input text.

        Returns:
            A tuple ``(None, phone_id)`` where ``phone_id`` is the flat list
            of ids looked up in ``PHPONE2ID``.

        Raises:
            KeyError: If a produced phone is not a key of ``PHPONE2ID``.
        """
        structure, _ = process(txt, self.g2p)
        ids = []
        for entry in structure:
            ids.extend(PHPONE2ID[phone] for phone in entry[1])
        return None, ids

    def phoneid2txt(self, phone_id):
        """Return the ``PHONE_SET`` symbols for an iterable of phone ids."""
        return [PHONE_SET[idx] for idx in phone_id]
if __name__ == "__main__":
    # Demo: phonemize one sentence and map the ids back to phone symbols.
    # The expected ids and length below are derived from the PHONE_SET
    # defined in this file (220 entries, "|" is the last one with id 219).
    processor = G2pProcessor()
    txt = "This is a test sentence."
    phoneid = processor.txt2phoneid(txt)[1]
    # output: [5, 73, 118, 175, 219, 116, 214, 219, 28, 219, 180, 82, 179, 181, 219, 174, 82, 149, 185, 30, 149, 175, 6]
    # print(phoneid)
    print(processor.phoneid2txt(phoneid))
    # output: ['<BOS>', 'DHB', 'IH1I', 'SE', '|', 'IH1B', 'ZE', '|', 'AH0B', '|', 'TB', 'EH1I', 'SI', 'TE', '|', 'SB', 'EH1I', 'NI', 'TI', 'AH0I', 'NI', 'SE', '<EOS>']
    print(len(PHONE_SET))
    # output: 220