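# NOTE: the three lines originally here ("Spaces:", "Build error", "Build error")
# are page-capture residue, not Python source; they are kept only as this comment.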
from TTS.tts.models.vits import Vits | |
from TTS.tts.configs.vits_config import VitsConfig | |
import numpy as np | |
import unicodedata | |
import regex | |
# Matches one numeric token: any run of digits, "." and "," that ends in a
# digit. The single capture group spans the whole token.
num_re = regex.compile(r"([0-9.,]*[0-9])")
# Vietnamese words for the digits 0-9, indexed by digit value.
digits = ["không", "một", "hai", "ba", "bốn", "năm", "sáu", "bảy", "tám", "chín"]


def _read_two_digits(num: str) -> str:
    """Spell a two-character digit string such as "07", "10", "15" or "21"."""
    tens, unit = int(num[0]), int(num[1])
    if tens == 0:
        # A leading zero ("05") is spelled digit by digit; treating it as a
        # teen would read it as fifteen.
        return digits[0] + " " + digits[unit]
    if unit == 0:
        return "mười" if tens == 1 else digits[tens] + " mươi"
    if unit == 5:
        end = "lăm"  # 5 in the unit position is "lăm", not "năm"
    elif unit == 1 and tens > 1:
        end = "mốt"  # 1 after "mươi" is "mốt" (21, 31, ...), but 11 keeps "một"
    else:
        end = digits[unit]
    if tens == 1:
        return "mười " + end
    return digits[tens] + " mươi " + end


def read_number(num: str) -> str:
    """Translate numeric text into its written Vietnamese form.

    Supported inputs:
      * plain digit strings of 1 to 6 characters ("7", "21", "105", "21000");
      * "." separated thousands groups with two or three groups
        ("1.000", "1.250.000");
      * "," separated decimals ("1,5"); every comma is read as "phẩy".

    Args:
        num: The numeric token to spell out.

    Returns:
        The spelled-out form, or ``num`` unchanged when the token is not in a
        supported shape (for example digit strings longer than 6 characters
        or more than three dotted groups).
    """
    if num.isdigit():
        size = len(num)
        if size == 1:
            return digits[int(num)]
        if size == 2:
            return _read_two_digits(num)
        n = int(num)
        if size == 3:
            hundreds = digits[n // 100] + " trăm"
            if n % 100 == 0:
                return hundreds
            if num[1] == "0":
                # x0y is read "x trăm lẻ y".
                return hundreds + " lẻ " + digits[n % 100]
            return hundreds + " " + read_number(num[1:])
        if size <= 6:
            thousands = read_number(str(n // 1000)) + " ngàn"
            if n % 1000 == 0:
                # Round thousands: "1000" is "một ngàn", with no trailing
                # "không trăm" (matches how "1.000" is read below).
                return thousands
            return thousands + " " + read_number(num[-3:])
        return num

    if len(num) <= 1:
        # Empty input or a single non-digit character: nothing to spell.
        return num

    if "," in num:
        # Joining instead of two-way unpacking keeps tokens with several
        # commas ("1,2,3") from raising ValueError.
        return " phẩy ".join(read_number(part) for part in num.split(","))

    if "." in num:
        groups = num.split(".")
        if len(groups) == 2:
            head, tail = groups
            thousands = read_number(head) + " ngàn"
            if tail == "000":
                return thousands
            if len(tail) == 3 and tail.startswith("00"):
                # "1.005" is read "một ngàn lẻ năm". The length guard avoids
                # int("") / an out-of-range index on groups like "00" or "0012".
                return thousands + " lẻ " + digits[int(tail[2])]
            return thousands + " " + read_number(tail)
        if len(groups) == 3:
            millions, thousands, units = groups
            words = [read_number(millions) + " triệu"]
            # All-zero groups are silent: "2.000.000" is just "hai triệu".
            if thousands != "000":
                words.append(read_number(thousands) + " ngàn")
            if units != "000":
                words.append(read_number(units))
            return " ".join(words)

    return num
def load_model(config_path="vits/config.json", onnx_path="vits/coqui_vits.onnx"):
    """Build a VITS model from a JSON config and load its ONNX weights.

    After loading, one inference is run on a fixed sentence and its output is
    discarded.
    NOTE(review): presumably a warm-up / smoke test of the ONNX session -
    confirm before removing it.

    Args:
        config_path: Path of the VITS JSON configuration file.
        onnx_path: Path of the exported ONNX model file.

    Returns:
        The ``Vits`` instance with the ONNX model loaded.
    """
    config = VitsConfig()
    config.load_json(config_path)
    vits = Vits.init_from_config(config)
    vits.load_onnx(onnx_path)

    # Token ids as an int64 array with a leading axis of length 1.
    token_ids = vits.tokenizer.text_to_ids("xin chào tôi là hoàng đây")
    text_inputs = np.asarray(token_ids, dtype=np.int64)[None, :]
    # The result is intentionally not kept; only the call itself matters here.
    vits.inference_onnx(text_inputs)
    return vits
# Numeric tokens whose "." / "," separators carry meaning and must reach
# read_number intact: dotted thousands groups with an optional decimal part
# ("1.000", "1.250.000", "1.000,5"), or a plain decimal ("1,5").
_SEPARATED_NUMBER = regex.compile(
    r"[0-9]{1,3}(?:\.[0-9]{3}){1,2}(?:,[0-9]+)?|[0-9]+,[0-9]+"
)

# Translation table deleting punctuation that is never part of a number.
_PUNCT_TABLE = str.maketrans("", "", ";:!?()")

# Translation table deleting "." and ",".
_SEPARATOR_TABLE = str.maketrans("", "", ".,")


def normalize_text(text):
    """Normalize the input text.

    Steps: lowercase, apply Unicode NFKC normalization, delete punctuation,
    and replace numeric tokens by their written form via ``read_number``.

    "." and "," are kept only inside well-formed numbers (see
    ``_SEPARATED_NUMBER``) so that "1,5" is read as a decimal and "1.000" as
    a thousand; everywhere else they are deleted, as are ";:!?()".

    Args:
        text (str): The input text.

    Returns:
        str: The normalized text, words separated by single spaces.
    """
    # Lowercase first, then Unicode-normalize.
    text = unicodedata.normalize("NFKC", text.lower())
    text = text.translate(_PUNCT_TABLE)

    # Isolate every numeric token as its own whitespace-delimited word.
    text = num_re.sub(r" \1 ", text)

    words = []
    for word in text.split():
        if num_re.fullmatch(word):
            if not _SEPARATED_NUMBER.fullmatch(word):
                # Separators in an unsupported arrangement ("3.5", "1,2,3"):
                # read the bare digits instead.
                word = word.translate(_SEPARATOR_TABLE)
            word = read_number(word)
        else:
            # Sentence punctuation attached to, or standing in for, a word.
            word = word.translate(_SEPARATOR_TABLE)
        if word:
            words.append(word)
    return " ".join(words)