import os
import json
import logging
import config
import numpy as np
from utils.utils import check_is_none
from voice import vits, TTS
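
# Map each text cleaner name to the language codes it supports.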
lang_dict = {
"english_cleaners": ["en"],
"english_cleaners2": ["en"],
"japanese_cleaners": ["ja"],
"japanese_cleaners2": ["ja"],
"korean_cleaners": ["ko"],
"chinese_cleaners": ["zh"],
"zh_ja_mixture_cleaners": ["zh", "ja"],
"sanskrit_cleaners": ["sa"],
"cjks_cleaners": ["zh", "ja", "ko", "sa"],
"cjke_cleaners": ["zh", "ja", "ko", "en"],
"cjke_cleaners2": ["zh", "ja", "ko", "en"],
"cje_cleaners": ["zh", "ja", "en"],
"cje_cleaners2": ["zh", "ja", "en"],
"thai_cleaners": ["th"],
"shanghainese_cleaners": ["sh"],
"chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
"ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
"YB"],
"bert_chinese_cleaners": ["zh"],
}


def analysis(model_config_json):
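    """Infer the model type ("vits", "w2v2" or "hubert") from a model's config.json file object."""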
    model_config = json.load(model_config_json)
    symbols = model_config.get("symbols", None)
    emotion_embedding = model_config.get("data", {}).get("emotion_embedding", False)
    if symbols is not None:
        if not emotion_embedding:
            model_type = "vits"
        else:
            model_type = "w2v2"
    else:
        model_type = "hubert"
    return model_type


def load_npy(model_):
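    """Load dimensional emotion reference vectors (shape (-1, 1024)) from a .npy file, a list of .npy files, or a directory of .npy files."""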
    if isinstance(model_, list):
        # check that every entry is a .npy file
        for i in model_:
            _model_extension = os.path.splitext(i)[1]
            if _model_extension != ".npy":
                raise ValueError(f"Unsupported model type: {_model_extension}")

        # merge npy files
        emotion_reference = np.empty((0, 1024))
        for i in model_:
            tmp = np.load(i).reshape(-1, 1024)
            emotion_reference = np.append(emotion_reference, tmp, axis=0)

    elif os.path.isdir(model_):
        emotion_reference = np.empty((0, 1024))
        for root, dirs, files in os.walk(model_):
            for file_name in files:
                # skip anything that is not a .npy file
                _model_extension = os.path.splitext(file_name)[1]
                if _model_extension != ".npy":
                    continue
                file_path = os.path.join(root, file_name)

                # merge npy files
                tmp = np.load(file_path).reshape(-1, 1024)
                emotion_reference = np.append(emotion_reference, tmp, axis=0)

    elif os.path.isfile(model_):
        # check that the file is a .npy
        _model_extension = os.path.splitext(model_)[1]
        if _model_extension != ".npy":
            raise ValueError(f"Unsupported model type: {_model_extension}")

        emotion_reference = np.load(model_)

    else:
        raise ValueError(f"Invalid emotion reference path: {model_}")

    logging.info(f"Loaded dimensional emotion npy, {len(emotion_reference)} vectors")
    return emotion_reference


def merge_model(merging_model):
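    """Load every (model_path, config_path) pair in merging_model, group the models by type and wrap them in a single TTS object."""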
vits_obj = []
vits_speakers = []
hubert_vits_obj = []
hubert_vits_speakers = []
w2v2_vits_obj = []
w2v2_vits_speakers = []
# model list
vits_list = []
hubert_vits_list = []
w2v2_vits_list = []
    for item in merging_model:
        with open(item[1], 'r', encoding='utf-8') as model_config:
            model_type = analysis(model_config)
            if model_type == "vits":
                vits_list.append(item)
            elif model_type == "hubert":
                hubert_vits_list.append(item)
            elif model_type == "w2v2":
                w2v2_vits_list.append(item)
# merge vits
new_id = 0
for obj_id, i in enumerate(vits_list):
obj = vits(model=i[0], config=i[1], model_type="vits")
lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
        if isinstance(obj.get_speakers(), list):
            for speaker_id, name in enumerate(obj.get_speakers()):
                vits_obj.append([int(speaker_id), obj, obj_id])
                vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                new_id += 1
        else:
            for speaker_id, (name, _) in enumerate(obj.get_speakers().items()):
                vits_obj.append([int(speaker_id), obj, obj_id])
                vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                new_id += 1
# merge hubert-vits
    if len(hubert_vits_list) != 0:
        if getattr(config, "HUBERT_SOFT_MODEL", None) is None or check_is_none(config.HUBERT_SOFT_MODEL):
            raise ValueError("Please configure the HUBERT_SOFT_MODEL path in config.py")
        try:
            from hubert_model import hubert_soft
            hubert = hubert_soft(config.HUBERT_SOFT_MODEL)
        except Exception as e:
            raise ValueError(f"Load HUBERT_SOFT_MODEL failed: {e}")
new_id = 0
        for obj_id, i in enumerate(hubert_vits_list):
            obj = vits(model=i[0], config=i[1], model_=hubert, model_type="hubert")
            lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
            for speaker_id, name in enumerate(obj.get_speakers()):
                hubert_vits_obj.append([int(speaker_id), obj, obj_id])
                hubert_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                new_id += 1
# merge w2v2-vits
emotion_reference = None
    if len(w2v2_vits_list) != 0:
        if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) is None or check_is_none(config.DIMENSIONAL_EMOTION_NPY):
            raise ValueError("Please configure the DIMENSIONAL_EMOTION_NPY path in config.py")
        try:
            emotion_reference = load_npy(config.DIMENSIONAL_EMOTION_NPY)
        except Exception as e:
            raise ValueError(f"Load DIMENSIONAL_EMOTION_NPY failed: {e}")
new_id = 0
        for obj_id, i in enumerate(w2v2_vits_list):
            obj = vits(model=i[0], config=i[1], model_=emotion_reference, model_type="w2v2")
            lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
            for speaker_id, name in enumerate(obj.get_speakers()):
                w2v2_vits_obj.append([int(speaker_id), obj, obj_id])
                w2v2_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                new_id += 1
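
    # collect all loaded models and speaker metadata into a single TTS object;
    # w2v2_emotion_count is the number of available dimensional emotion reference vectors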
voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj}
voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers}
w2v2_emotion_count = len(emotion_reference) if emotion_reference is not None else 0
tts = TTS(voice_obj, voice_speakers, w2v2_emotion_count=w2v2_emotion_count)
return tts