"""Classify voice-model configs and merge the loaded models into one TTS object."""
import os
import json
import logging
import torch
import config
import numpy as np
from utils.utils import check_is_none
from vits import VITS
from voice import TTS

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Maps a text-cleaner name (as returned by ``obj.get_cleaner()``) to the list
# of language codes advertised for every speaker of a model using it.
lang_dict = {
    "english_cleaners": ["en"],
    "english_cleaners2": ["en"],
    "japanese_cleaners": ["ja"],
    "japanese_cleaners2": ["ja"],
    "korean_cleaners": ["ko"],
    "chinese_cleaners": ["zh"],
    "zh_ja_mixture_cleaners": ["zh", "ja"],
    "sanskrit_cleaners": ["sa"],
    "cjks_cleaners": ["zh", "ja", "ko", "sa"],
    "cjke_cleaners": ["zh", "ja", "ko", "en"],
    "cjke_cleaners2": ["zh", "ja", "ko", "en"],
    "cje_cleaners": ["zh", "ja", "en"],
    "cje_cleaners2": ["zh", "ja", "en"],
    "thai_cleaners": ["th"],
    "shanghainese_cleaners": ["sh"],
    "chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
                                 "ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
                                 "YB"],
    "bert_chinese_cleaners": ["zh"],
}

# Width every emotion reference array is reshaped to before merging.
_EMOTION_DIM = 1024


def analysis(model_config_json):
    """Return the model type described by an open JSON config file.

    Args:
        model_config_json: A readable file object containing the model's
            JSON configuration.

    Returns:
        ``"bert_vits2"`` when the ``"model"`` section contains the key
        ``"use_spk_conditioned_encoder"``; otherwise ``"hubert"`` when the
        config has no ``"symbols"`` entry, ``"w2v2"`` when
        ``data.emotion_embedding`` is truthy, and ``"vits"`` in the
        remaining case.

    Raises:
        AttributeError: If the config has no ``"data"`` section.
        TypeError: If the config has no ``"model"`` section.
    """
    model_config = json.load(model_config_json)
    symbols = model_config.get("symbols", None)
    emotion_embedding = model_config.get("data").get("emotion_embedding", False)

    if "use_spk_conditioned_encoder" in model_config.get("model"):
        return "bert_vits2"
    if symbols is None:
        return "hubert"
    return "w2v2" if emotion_embedding else "vits"


def _stack_emotion_chunks(chunks):
    """Concatenate ``(n, 1024)`` chunks row-wise in a single allocation."""
    # The empty float64 seed keeps the result's dtype and (0, 1024) shape
    # for an empty ``chunks`` identical to an incremental np.append build-up.
    return np.concatenate([np.empty((0, _EMOTION_DIM)), *chunks], axis=0)


def load_npy(model_):
    """Load the emotion reference array from ``.npy`` file(s).

    Args:
        model_: One of
            * a list of ``.npy`` file paths, merged in list order;
            * a directory path, walked recursively with every ``.npy`` file
              merged and other files skipped;
            * a single ``.npy`` file path.

    Returns:
        For a list or directory, an array of shape ``(N, 1024)`` built by
        reshaping each file to ``(-1, 1024)`` and stacking the rows. For a
        single file, the array exactly as stored (it is not reshaped).

    Raises:
        ValueError: If a listed path or the single file does not end in
            ``.npy``, or if ``model_`` is a path that is neither an existing
            directory nor an existing file.
    """
    if isinstance(model_, list):
        # Validate every extension up front so nothing is loaded when the
        # list contains an unsupported file.
        for path in model_:
            extension = os.path.splitext(path)[1]
            if extension != ".npy":
                raise ValueError(f"Unsupported model type: {extension}")
        emotion_reference = _stack_emotion_chunks(
            [np.load(path).reshape(-1, _EMOTION_DIM) for path in model_]
        )
    elif os.path.isdir(model_):
        chunks = []
        for root, _dirs, files in os.walk(model_):
            for file_name in files:
                if os.path.splitext(file_name)[1] != ".npy":
                    continue
                file_path = os.path.join(root, file_name)
                chunks.append(np.load(file_path).reshape(-1, _EMOTION_DIM))
        emotion_reference = _stack_emotion_chunks(chunks)
    elif os.path.isfile(model_):
        extension = os.path.splitext(model_)[1]
        if extension != ".npy":
            raise ValueError(f"Unsupported model type: {extension}")
        emotion_reference = np.load(model_)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError; report the real problem instead.
        raise ValueError(f"Emotion reference path does not exist: {model_}")

    logging.info("Loaded emotional dimention npy range:%s", len(emotion_reference))
    return emotion_reference


def _register_speakers(model_entries, build_model, resolve_lang):
    """Instantiate each model and flatten its speakers into lookup tables.

    Args:
        model_entries: Sequence of model entries handed to ``build_model``.
        build_model: Callable taking one entry and returning a model object
            that exposes ``get_speakers()``.
        resolve_lang: Callable taking the model object and returning the
            language list attached to each of its speakers.

    Returns:
        A ``(voice_objs, speakers)`` tuple. ``voice_objs`` holds
        ``[speaker_index_in_model, model_object, model_index]`` triples and
        ``speakers`` holds ``{"id", "name", "lang"}`` dicts whose ``id``
        counts up from 0 across all models in ``model_entries``.
    """
    voice_objs = []
    speakers = []
    for model_index, entry in enumerate(model_entries):
        obj = build_model(entry)
        lang = resolve_lang(obj)
        for speaker_index, name in enumerate(obj.get_speakers()):
            voice_objs.append([speaker_index, obj, model_index])
            # One speaker dict is appended per voice object, so the running
            # length is the next sequential id.
            speakers.append({"id": len(speakers), "name": name, "lang": lang})
    return voice_objs, speakers


def _cleaner_languages(obj):
    """Look up the languages for a model's cleaner, defaulting to unknown."""
    return lang_dict.get(obj.get_cleaner(), ["unknown"])


def merge_model(merging_model):
    """Load every configured model and bundle them into a single ``TTS``.

    Each entry is classified with ``analysis`` from its config file, then the
    models are loaded type by type (VITS, HuBERT-VITS, W2V2-VITS, Bert-VITS2)
    and their speakers are numbered sequentially within each type.

    Args:
        merging_model: Iterable of entries where ``entry[0]`` is passed to
            the model class as ``model`` and ``entry[1]`` is the path of the
            model's JSON config (opened as UTF-8).

    Returns:
        A ``TTS`` instance built from the per-type voice objects and speaker
        tables, the number of loaded emotion references (0 when no W2V2
        model is present) and the module-level ``device``.

    Raises:
        ValueError: If HuBERT-VITS models are present but
            ``config.HUBERT_SOFT_MODEL`` is unset or fails to load, or if
            W2V2-VITS models are present but
            ``config.DIMENSIONAL_EMOTION_NPY`` is unset or fails to load.
    """
    # Bucket the entries by detected model type, preserving input order.
    entries_by_type = {"vits": [], "hubert": [], "w2v2": [], "bert_vits2": []}
    for entry in merging_model:
        with open(entry[1], 'r', encoding='utf-8') as model_config:
            model_type = analysis(model_config)
        if model_type in entries_by_type:
            entries_by_type[model_type].append(entry)

    vits_list = entries_by_type["vits"]
    hubert_vits_list = entries_by_type["hubert"]
    w2v2_vits_list = entries_by_type["w2v2"]
    bert_vits2_list = entries_by_type["bert_vits2"]

    # merge vits
    vits_obj, vits_speakers = _register_speakers(
        vits_list,
        lambda entry: VITS(model=entry[0], config=entry[1], model_type="vits", device=device),
        _cleaner_languages,
    )

    # merge hubert-vits
    hubert = None
    if hubert_vits_list:
        if getattr(config, "HUBERT_SOFT_MODEL", None) is None or check_is_none(config.HUBERT_SOFT_MODEL):
            raise ValueError("Please configure HUBERT_SOFT_MODEL path in config.py")
        try:
            # Imported lazily so the dependency is only needed when a
            # HuBERT model is actually configured.
            from vits.hubert_model import hubert_soft
            hubert = hubert_soft(config.HUBERT_SOFT_MODEL)
        except Exception as e:
            raise ValueError(f"Load HUBERT_SOFT_MODEL failed {e}") from e

    hubert_vits_obj, hubert_vits_speakers = _register_speakers(
        hubert_vits_list,
        lambda entry: VITS(model=entry[0], config=entry[1], model_=hubert, model_type="hubert", device=device),
        _cleaner_languages,
    )

    # merge w2v2-vits
    emotion_reference = None
    if w2v2_vits_list:
        if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) is None or check_is_none(
                config.DIMENSIONAL_EMOTION_NPY):
            raise ValueError("Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
        try:
            emotion_reference = load_npy(config.DIMENSIONAL_EMOTION_NPY)
        except Exception as e:
            raise ValueError(f"Load DIMENSIONAL_EMOTION_NPY failed {e}") from e

    w2v2_vits_obj, w2v2_vits_speakers = _register_speakers(
        w2v2_vits_list,
        lambda entry: VITS(model=entry[0], config=entry[1], model_=emotion_reference, model_type="w2v2",
                           device=device),
        _cleaner_languages,
    )

    # merge Bert_VITS2
    bert_vits2_obj, bert_vits2_speakers = [], []
    if bert_vits2_list:
        # Imported lazily so the dependency is only needed when a
        # Bert-VITS2 model is actually configured.
        from bert_vits2 import Bert_VITS2
        bert_vits2_obj, bert_vits2_speakers = _register_speakers(
            bert_vits2_list,
            lambda entry: Bert_VITS2(model=entry[0], config=entry[1], device=device),
            lambda _obj: ["ZH"],
        )

    voice_obj = {
        "VITS": vits_obj,
        "HUBERT-VITS": hubert_vits_obj,
        "W2V2-VITS": w2v2_vits_obj,
        "BERT-VITS2": bert_vits2_obj,
    }
    voice_speakers = {
        "VITS": vits_speakers,
        "HUBERT-VITS": hubert_vits_speakers,
        "W2V2-VITS": w2v2_vits_speakers,
        "BERT-VITS2": bert_vits2_speakers,
    }

    w2v2_emotion_count = len(emotion_reference) if emotion_reference is not None else 0
    tts = TTS(voice_obj, voice_speakers, w2v2_emotion_count=w2v2_emotion_count, device=device)
    return tts