Spaces: Mahiruoshi / Running
Commit 5220ea7 (parent: f4cadb2): Upload 86 files

Browse files
- bert_gen.py +24 -17
- clap_gen.py +1 -1
- config.yml +17 -17
- data_utils.py +7 -23
- export_onnx.py +6 -4
- infer.py +53 -100
- losses.py +95 -0
- models.py +66 -65
- onnx_infer.py +68 -0
- re_matching.py +0 -1
- requirements.txt +2 -3
- resample.py +10 -6
- resample_legacy.py +71 -0
- server.py +214 -47
- server_fastapi.py +39 -1
- slm/wavlm-base-plus/.gitattributes +27 -0
- slm/wavlm-base-plus/README.md +65 -0
- slm/wavlm-base-plus/config.json +99 -0
- slm/wavlm-base-plus/preprocessor_config.json +9 -0
- slm/wavlm-base-plus/pytorch_model.bin +3 -0
- text/__init__.py +4 -2
- text/__pycache__/__init__.cpython-311.pyc +0 -0
- text/__pycache__/bert_utils.cpython-311.pyc +0 -0
- text/__pycache__/chinese.cpython-311.pyc +0 -0
- text/__pycache__/chinese_bert.cpython-311.pyc +0 -0
- text/__pycache__/cleaner.cpython-311.pyc +0 -0
- text/__pycache__/english.cpython-311.pyc +0 -0
- text/__pycache__/english_bert_mock.cpython-311.pyc +0 -0
- text/__pycache__/japanese.cpython-311.pyc +0 -0
- text/__pycache__/japanese_bert.cpython-311.pyc +0 -0
- text/__pycache__/symbols.cpython-311.pyc +0 -0
- text/__pycache__/tone_sandhi.cpython-311.pyc +0 -0
- text/chinese_bert.py +21 -3
- text/cleaner.py +2 -2
- text/english.py +71 -29
- text/english_bert_mock.py +21 -2
- text/japanese_bert.py +23 -2
- text/tone_sandhi.py +7 -3
- tools/__pycache__/__init__.cpython-311.pyc +0 -0
- tools/__pycache__/classify_language.cpython-311.pyc +0 -0
- tools/__pycache__/log.cpython-311.pyc +0 -0
- tools/__pycache__/sentence.cpython-311.pyc +0 -0
- tools/__pycache__/translate.cpython-311.pyc +0 -0
- train_ms.py +172 -58
- utils.py +5 -1
- webui.py +194 -174
- webui_preprocess.py +10 -21
bert_gen.py
CHANGED
@@ -1,17 +1,16 @@
-import argparse
-from multiprocessing import Pool, cpu_count
-
 import torch
-
-from tqdm import tqdm
-
+from multiprocessing import Pool
 import commons
 import utils
+from tqdm import tqdm
+from text import check_bert_models, cleaned_text_to_sequence, get_bert
+import argparse
+import torch.multiprocessing as mp
 from config import config
-from text import cleaned_text_to_sequence, get_bert


-def process_line(line):
+def process_line(x):
+    line, add_blank = x
     device = config.bert_gen_config.device
     if config.bert_gen_config.use_multi_device:
         rank = mp.current_process()._identity
@@ -28,12 +27,13 @@ def process_line(line):
     word2ph = [i for i in word2ph]
     phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

-
-
-
-
-
-
+    if add_blank:
+        phone = commons.intersperse(phone, 0)
+        tone = commons.intersperse(tone, 0)
+        language = commons.intersperse(language, 0)
+        for i in range(len(word2ph)):
+            word2ph[i] = word2ph[i] * 2
+        word2ph[0] += 1

     bert_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".bert.pt")

@@ -59,16 +59,23 @@ if __name__ == "__main__":
     args, _ = parser.parse_known_args()
     config_path = args.config
     hps = utils.get_hparams_from_file(config_path)
+    check_bert_models()
     lines = []
     with open(hps.data.training_files, encoding="utf-8") as f:
         lines.extend(f.readlines())

     with open(hps.data.validation_files, encoding="utf-8") as f:
         lines.extend(f.readlines())
+    add_blank = [hps.data.add_blank] * len(lines)
+
     if len(lines) != 0:
-        num_processes =
+        num_processes = args.num_processes
         with Pool(processes=num_processes) as pool:
-            for _ in tqdm(
-
+            for _ in tqdm(
+                pool.imap_unordered(process_line, zip(lines, add_blank)),
+                total=len(lines),
+            ):
+                # 这里是缩进的代码块,表示循环体
+                pass  # 使用pass语句作为占位符

     print(f"bert生成完毕!, 共有{len(lines)}个bert.pt生成!")
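For orientation, here is a minimal sketch of the driver pattern the new bert_gen.py uses: each work item is now a (line, add_blank) tuple consumed by process_line through Pool.imap_unordered. The stub worker body and the sample filelist entry below are placeholders, not code from the repository.

from multiprocessing import Pool
from tqdm import tqdm

def process_line(x):
    # each task carries the filelist line plus its own add_blank flag
    line, add_blank = x
    # ... per-line BERT feature extraction would happen here ...
    return line

if __name__ == "__main__":
    lines = ["Data/V23/wavs/a.wav|spk|ZH|你好。\n"]   # placeholder filelist entry
    add_blank = [True] * len(lines)                   # normally hps.data.add_blank
    with Pool(processes=2) as pool:
        for _ in tqdm(pool.imap_unordered(process_line, zip(lines, add_blank)),
                      total=len(lines)):
            pass  # the real worker writes .bert.pt files as a side effect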
clap_gen.py
CHANGED
@@ -27,7 +27,7 @@ def process_line(line):
     device = torch.device("cpu")
     wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")

-    clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.
+    clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.pt")
     if os.path.isfile(clap_path):
         return

config.yml
CHANGED
@@ -4,7 +4,7 @@
 # 拟提供通用路径配置,统一存放数据,避免数据放得很乱
 # 每个数据集与其对应的模型存放至统一路径下,后续所有的路径配置均为相对于datasetPath的路径
 # 不填或者填空则路径为相对于项目根目录的路径
-dataset_path: "Data/"
+dataset_path: "Data/V23"

 # 模型镜像源,默认huggingface,使用openi镜像源需指定openi_token
 mirror: ""
@@ -17,16 +17,16 @@ resample:
   sampling_rate: 44100
   # 音频文件输入路径,重采样会将该路径下所有.wav音频文件重采样
   # 请填入相对于datasetPath的相对路径
-  in_dir: "
+  in_dir: "" # 相对于根目录的路径为 /datasetPath/in_dir
   # 音频文件重采样后输出路径
-  out_dir: "
+  out_dir: ""


 # preprocess_text 数据集预处理相关配置
 # 注意, “:” 后需要加空格
 preprocess_text:
   # 原始文本文件路径,文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
-  transcription_path: "filelists
+  transcription_path: "filelists/whole.list"
   # 数据清洗后文本路径,可以不填。不填则将在原始文本目录生成
   cleaned_path: ""
   # 训练集路径
@@ -34,11 +34,11 @@ preprocess_text:
   # 验证集路径
   val_path: "filelists/val.list"
   # 配置文件路径
-  config_path: "config.json"
+  config_path: "configs/config.json"
   # 每个语言的验证集条数
   val_per_lang: 4
   # 验证集最大条数,多于的会被截断并放到训练集中
-  max_val_total:
+  max_val_total: 800
   # 是否进行数据清洗
   clean: true

@@ -47,7 +47,7 @@ preprocess_text:
 # 注意, “:” 后需要加空格
 bert_gen:
   # 训练数据集配置文件路径
-  config_path: "config.json"
+  config_path: "configs/config.json"
   # 并行数
   num_processes: 4
   # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
@@ -60,9 +60,9 @@ bert_gen:
 # 注意, “:” 后需要加空格
 emo_gen:
   # 训练数据集配置文件路径
-  config_path: "config.json"
+  config_path: "configs/config.json"
   # 并行数
-  num_processes:
+  num_processes: 16
   # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
   device: "cuda"
   # 使用多卡推理
@@ -81,15 +81,15 @@ train_ms:
   # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
   # 底模设置
   base:
-    use_base_model:
+    use_base_model: True
     repo_id: "Stardust_minus/Bert-VITS2"
-    model_image: "Bert-VITS2_2.
+    model_image: "Bert-VITS2_2.3底模" # openi网页的模型名
   # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
   model: "models"
   # 配置文件路径
-  config_path: "config.json"
+  config_path: "configs/config.json"
   # 训练使用的worker,不建议超过CPU核心数
-  num_workers:
+  num_workers: 22
   # 关闭此项可以节约接近50%的磁盘空间,但是可能导致实际训练速度变慢和更高的CPU使用率。
   spec_cache: True
   # 保存的检查点数量,多于此数目的权重会被删除来节省空间。
@@ -102,9 +102,9 @@ webui:
   # 推理设备
   device: "cuda"
   # 模型路径
-  model: "models/
+  model: "models/G_408000.pth"
   # 配置文件路径
-  config_path: "config.json"
+  config_path: "configs/config.json"
   # 端口号
   port: 7860
   # 是否公开部署,对外网开放
@@ -172,6 +172,6 @@ server:
 # 请不要在github等网站公开分享你的app id 与 key
 translate:
   # 你的APPID
-  "app_key": ""
+  "app_key": "20231117001883321"
   # 你的密钥
-  "secret_key": ""
+  "secret_key": "lMQbvZHeJveDceLof2wf"
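To sanity-check the edited paths, the file can be loaded with PyYAML; a small sketch follows. The key names come from the config above, while joining them against dataset_path is an assumption about how the project's config.py consumes them.

import os
import yaml

with open("config.yml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

dataset_path = cfg["dataset_path"]                                 # "Data/V23"
print(os.path.join(dataset_path, cfg["train_ms"]["model"]))        # Data/V23/models
print(os.path.join(dataset_path, cfg["train_ms"]["config_path"]))  # Data/V23/configs/config.json
print(cfg["webui"]["model"], cfg["webui"]["port"])                 # models/G_408000.pth 7860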
data_utils.py
CHANGED
@@ -44,10 +44,6 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         self.min_text_len = getattr(hparams, "min_text_len", 1)
         self.max_text_len = getattr(hparams, "max_text_len", 384)

-        self.empty_emo = torch.squeeze(
-            torch.load("empty_emo.npy", map_location="cpu"), dim=1
-        )
-
         random.seed(1234)
         random.shuffle(self.audiopaths_sid_text)
         self._filter()
@@ -98,14 +94,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         spec, wav = self.get_audio(audiopath)
         sid = torch.LongTensor([int(self.spk_map[sid])])

-
-            emo = torch.squeeze(
-                torch.load(audiopath.replace(".wav", ".emo.npy"), map_location="cpu"),
-                dim=1,
-            )
-        else:
-            emo = self.empty_emo
-        return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert, emo)
+        return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert)

     def get_audio(self, filename):
         audio, sampling_rate = load_wav_to_torch(filename)
@@ -168,15 +157,15 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):

         if language_str == "ZH":
             bert = bert_ori
-            ja_bert = torch.
-            en_bert = torch.
+            ja_bert = torch.randn(1024, len(phone))
+            en_bert = torch.randn(1024, len(phone))
         elif language_str == "JP":
-            bert = torch.
+            bert = torch.randn(1024, len(phone))
             ja_bert = bert_ori
-            en_bert = torch.
+            en_bert = torch.randn(1024, len(phone))
         elif language_str == "EN":
-            bert = torch.
-            ja_bert = torch.
+            bert = torch.randn(1024, len(phone))
+            ja_bert = torch.randn(1024, len(phone))
             en_bert = bert_ori
         phone = torch.LongTensor(phone)
         tone = torch.LongTensor(tone)
@@ -226,7 +215,6 @@ class TextAudioSpeakerCollate:
         bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
         ja_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
         en_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
-        emo = torch.FloatTensor(len(batch), 512)

         spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
@@ -238,7 +226,6 @@ class TextAudioSpeakerCollate:
         bert_padded.zero_()
         ja_bert_padded.zero_()
         en_bert_padded.zero_()
-        emo.zero_()

         for i in range(len(ids_sorted_decreasing)):
             row = batch[ids_sorted_decreasing[i]]
@@ -272,8 +259,6 @@ class TextAudioSpeakerCollate:
             en_bert = row[8]
             en_bert_padded[i, :, : en_bert.size(1)] = en_bert

-            emo[i, :] = row[9]
-
         return (
             text_padded,
             text_lengths,
@@ -287,7 +272,6 @@ class TextAudioSpeakerCollate:
             bert_padded,
             ja_bert_padded,
             en_bert_padded,
-            emo,
         )


export_onnx.py
CHANGED
@@ -2,11 +2,13 @@ from onnx_modules import export_onnx
 import os

 if __name__ == "__main__":
-    export_path = "
-    model_path = "
-    config_path = "
+    export_path = "BangDreamApi"
+    model_path = "Data/V23/models/G_621000.pth"
+    config_path = "Data/V23/configs/config.json"
+    novq = False
+    dev = False
     if not os.path.exists("onnx"):
         os.makedirs("onnx")
     if not os.path.exists(f"onnx/{export_path}"):
         os.makedirs(f"onnx/{export_path}")
-    export_onnx(export_path, model_path, config_path)
+    export_onnx(export_path, model_path, config_path, novq, dev)
infer.py
CHANGED
@@ -10,7 +10,8 @@
 import torch
 import commons
 from text import cleaned_text_to_sequence, get_bert
-
+
+# from clap_wrapper import get_clap_audio_feature, get_clap_text_feature
 from text.cleaner import clean_text
 import utils
 import numpy as np
@@ -32,7 +33,7 @@ from oldVersion.V101.text import symbols as V101symbols
 from oldVersion import V111, V110, V101, V200, V210

 # 当前版本信息
-latest_version = "2.
+latest_version = "2.3"

 # 版本兼容
 SynthesizerTrnMap = {
@@ -98,7 +99,8 @@ def get_net_g(model_path: str, version: str, device: str, hps):
     return net_g


-def get_text(text, language_str, hps, device):
+def get_text(text, language_str, hps, device, style_text=None, style_weight=0.7):
+    style_text = None if style_text == "" else style_text
     # 在此处实现当前版本的get_text
     norm_text, phone, tone, word2ph = clean_text(text, language_str)
     phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
@@ -110,21 +112,23 @@ def get_text(text, language_str, hps, device):
     for i in range(len(word2ph)):
         word2ph[i] = word2ph[i] * 2
     word2ph[0] += 1
-    bert_ori = get_bert(
+    bert_ori = get_bert(
+        norm_text, word2ph, language_str, device, style_text, style_weight
+    )
     del word2ph
     assert bert_ori.shape[-1] == len(phone), phone

     if language_str == "ZH":
         bert = bert_ori
-        ja_bert = torch.
-        en_bert = torch.
+        ja_bert = torch.randn(1024, len(phone))
+        en_bert = torch.randn(1024, len(phone))
     elif language_str == "JP":
-        bert = torch.
+        bert = torch.randn(1024, len(phone))
         ja_bert = bert_ori
-        en_bert = torch.
+        en_bert = torch.randn(1024, len(phone))
     elif language_str == "EN":
-        bert = torch.
-        ja_bert = torch.
+        bert = torch.randn(1024, len(phone))
+        ja_bert = torch.randn(1024, len(phone))
         en_bert = bert_ori
     else:
         raise ValueError("language_str should be ZH, JP or EN")
@@ -154,84 +158,17 @@ def infer(
     reference_audio=None,
     skip_start=False,
     skip_end=False,
+    style_text=None,
+    style_weight=0.7,
 ):
-    # 2.2版本参数位置变了
-    # 2.1 参数新增 emotion reference_audio skip_start skip_end
-    inferMap_V3 = {
-        "2.1": V210.infer,
-    }
-    # 支持中日英三语版本
-    inferMap_V2 = {
-        "2.0.2-fix": V200.infer,
-        "2.0.1": V200.infer,
-        "2.0": V200.infer,
-        "1.1.1-fix": V111.infer_fix,
-        "1.1.1": V111.infer,
-        "1.1": V110.infer,
-        "1.1.0": V110.infer,
-    }
-    # 仅支持中文版本
-    # 在测试中,并未发现两个版本的模型不能互相通用
-    inferMap_V1 = {
-        "1.0.1": V101.infer,
-        "1.0": V101.infer,
-        "1.0.0": V101.infer,
-    }
-    version = hps.version if hasattr(hps, "version") else latest_version
-    # 非当前版本,根据版本号选择合适的infer
-    if version != latest_version:
-        if version in inferMap_V3.keys():
-            return inferMap_V3[version](
-                text,
-                sdp_ratio,
-                noise_scale,
-                noise_scale_w,
-                length_scale,
-                sid,
-                language,
-                hps,
-                net_g,
-                device,
-                reference_audio,
-                emotion,
-                skip_start,
-                skip_end,
-            )
-        if version in inferMap_V2.keys():
-            return inferMap_V2[version](
-                text,
-                sdp_ratio,
-                noise_scale,
-                noise_scale_w,
-                length_scale,
-                sid,
-                language,
-                hps,
-                net_g,
-                device,
-            )
-        if version in inferMap_V1.keys():
-            return inferMap_V1[version](
-                text,
-                sdp_ratio,
-                noise_scale,
-                noise_scale_w,
-                length_scale,
-                sid,
-                hps,
-                net_g,
-                device,
-            )
-    # 在此处实现当前版本的推理
-    # emo = get_emo_(reference_audio, emotion, sid)
-    if isinstance(reference_audio, np.ndarray):
-        emo = get_clap_audio_feature(reference_audio, device)
-    else:
-        emo = get_clap_text_feature(emotion, device)
-    emo = torch.squeeze(emo, dim=1)

     bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
-        text,
+        text,
+        language,
+        hps,
+        device,
+        style_text=style_text,
+        style_weight=style_weight,
     )
     if skip_start:
         phones = phones[3:]
@@ -255,7 +192,7 @@ def infer(
         ja_bert = ja_bert.to(device).unsqueeze(0)
         en_bert = en_bert.to(device).unsqueeze(0)
         x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
-        emo = emo.to(device).unsqueeze(0)
+        # emo = emo.to(device).unsqueeze(0)
         del phones
         speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
         audio = (
@@ -268,7 +205,6 @@ def infer(
                 bert,
                 ja_bert,
                 en_bert,
-                emo,
                 sdp_ratio=sdp_ratio,
                 noise_scale=noise_scale,
                 noise_scale_w=noise_scale_w,
@@ -278,7 +214,16 @@ def infer(
             .float()
             .numpy()
         )
-        del
+        del (
+            x_tst,
+            tones,
+            lang_ids,
+            bert,
+            x_tst_lengths,
+            speakers,
+            ja_bert,
+            en_bert,
+        )  # , emo
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         return audio
@@ -302,14 +247,14 @@ def infer_multilang(
 ):
     bert, ja_bert, en_bert, phones, tones, lang_ids = [], [], [], [], [], []
     # emo = get_emo_(reference_audio, emotion, sid)
-    if isinstance(reference_audio, np.ndarray):
-
-    else:
-
-    emo = torch.squeeze(emo, dim=1)
+    # if isinstance(reference_audio, np.ndarray):
+    #     emo = get_clap_audio_feature(reference_audio, device)
+    # else:
+    #     emo = get_clap_text_feature(emotion, device)
+    # emo = torch.squeeze(emo, dim=1)
     for idx, (txt, lang) in enumerate(zip(text, language)):
-
-
+        _skip_start = (idx != 0) or (skip_start and idx == 0)
+        _skip_end = (idx != len(language) - 1) or skip_end
         (
             temp_bert,
             temp_ja_bert,
@@ -318,14 +263,14 @@ def infer_multilang(
             temp_tones,
             temp_lang_ids,
         ) = get_text(txt, lang, hps, device)
-        if
+        if _skip_start:
             temp_bert = temp_bert[:, 3:]
             temp_ja_bert = temp_ja_bert[:, 3:]
             temp_en_bert = temp_en_bert[:, 3:]
             temp_phones = temp_phones[3:]
             temp_tones = temp_tones[3:]
             temp_lang_ids = temp_lang_ids[3:]
-        if
+        if _skip_end:
             temp_bert = temp_bert[:, :-2]
             temp_ja_bert = temp_ja_bert[:, :-2]
             temp_en_bert = temp_en_bert[:, :-2]
@@ -351,7 +296,7 @@ def infer_multilang(
         bert = bert.to(device).unsqueeze(0)
         ja_bert = ja_bert.to(device).unsqueeze(0)
         en_bert = en_bert.to(device).unsqueeze(0)
-        emo = emo.to(device).unsqueeze(0)
+        # emo = emo.to(device).unsqueeze(0)
         x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
         del phones
         speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
@@ -365,7 +310,6 @@ def infer_multilang(
                 bert,
                 ja_bert,
                 en_bert,
-                emo,
                 sdp_ratio=sdp_ratio,
                 noise_scale=noise_scale,
                 noise_scale_w=noise_scale_w,
@@ -375,7 +319,16 @@ def infer_multilang(
             .float()
             .numpy()
         )
-        del
+        del (
+            x_tst,
+            tones,
+            lang_ids,
+            bert,
+            x_tst_lengths,
+            speakers,
+            ja_bert,
+            en_bert,
+        )  # , emo
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         return audio
losses.py
CHANGED
@@ -1,4 +1,6 @@
 import torch
+import torchaudio
+from transformers import AutoModel


 def feature_loss(fmap_r, fmap_g):
@@ -56,3 +58,96 @@ def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
     kl = torch.sum(kl * z_mask)
     l = kl / torch.sum(z_mask)
     return l
+
+
+class WavLMLoss(torch.nn.Module):
+    def __init__(self, model, wd, model_sr, slm_sr=16000):
+        super(WavLMLoss, self).__init__()
+        self.wavlm = AutoModel.from_pretrained(model)
+        self.wd = wd
+        self.resample = torchaudio.transforms.Resample(model_sr, slm_sr)
+        self.wavlm.eval()
+        for param in self.wavlm.parameters():
+            param.requires_grad = False
+
+    def forward(self, wav, y_rec):
+        with torch.no_grad():
+            wav_16 = self.resample(wav)
+            wav_embeddings = self.wavlm(
+                input_values=wav_16, output_hidden_states=True
+            ).hidden_states
+        y_rec_16 = self.resample(y_rec)
+        y_rec_embeddings = self.wavlm(
+            input_values=y_rec_16.squeeze(), output_hidden_states=True
+        ).hidden_states
+
+        floss = 0
+        for er, eg in zip(wav_embeddings, y_rec_embeddings):
+            floss += torch.mean(torch.abs(er - eg))
+
+        return floss.mean()
+
+    def generator(self, y_rec):
+        y_rec_16 = self.resample(y_rec)
+        y_rec_embeddings = self.wavlm(
+            input_values=y_rec_16, output_hidden_states=True
+        ).hidden_states
+        y_rec_embeddings = (
+            torch.stack(y_rec_embeddings, dim=1)
+            .transpose(-1, -2)
+            .flatten(start_dim=1, end_dim=2)
+        )
+        y_df_hat_g = self.wd(y_rec_embeddings)
+        loss_gen = torch.mean((1 - y_df_hat_g) ** 2)
+
+        return loss_gen
+
+    def discriminator(self, wav, y_rec):
+        with torch.no_grad():
+            wav_16 = self.resample(wav)
+            wav_embeddings = self.wavlm(
+                input_values=wav_16, output_hidden_states=True
+            ).hidden_states
+            y_rec_16 = self.resample(y_rec)
+            y_rec_embeddings = self.wavlm(
+                input_values=y_rec_16, output_hidden_states=True
+            ).hidden_states
+
+            y_embeddings = (
+                torch.stack(wav_embeddings, dim=1)
+                .transpose(-1, -2)
+                .flatten(start_dim=1, end_dim=2)
+            )
+            y_rec_embeddings = (
+                torch.stack(y_rec_embeddings, dim=1)
+                .transpose(-1, -2)
+                .flatten(start_dim=1, end_dim=2)
+            )
+
+        y_d_rs = self.wd(y_embeddings)
+        y_d_gs = self.wd(y_rec_embeddings)
+
+        y_df_hat_r, y_df_hat_g = y_d_rs, y_d_gs
+
+        r_loss = torch.mean((1 - y_df_hat_r) ** 2)
+        g_loss = torch.mean((y_df_hat_g) ** 2)
+
+        loss_disc_f = r_loss + g_loss
+
+        return loss_disc_f.mean()
+
+    def discriminator_forward(self, wav):
+        with torch.no_grad():
+            wav_16 = self.resample(wav)
+            wav_embeddings = self.wavlm(
+                input_values=wav_16, output_hidden_states=True
+            ).hidden_states
+            y_embeddings = (
+                torch.stack(wav_embeddings, dim=1)
+                .transpose(-1, -2)
+                .flatten(start_dim=1, end_dim=2)
+            )
+
+        y_d_rs = self.wd(y_embeddings)
+
+        return y_d_rs
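A rough usage sketch for the new WavLMLoss, paired with the WavLMDiscriminator added to models.py in this same commit. The checkpoint path and the tensor shapes below are assumptions for illustration; the real wiring lives in the training script.

import torch
from losses import WavLMLoss
from models import WavLMDiscriminator

wd = WavLMDiscriminator()                      # consumes 13 x 768 stacked WavLM features
wl = WavLMLoss("./slm/wavlm-base-plus", wd, model_sr=44100, slm_sr=16000)

wav = torch.randn(2, 44100)     # batch of ground-truth audio at the model sampling rate
y_rec = torch.randn(2, 44100)   # corresponding generator output
fm_loss = wl(wav, y_rec)                 # L1 distance over WavLM hidden states
d_loss = wl.discriminator(wav, y_rec)    # discriminator objective on those features
g_loss = wl.generator(y_rec)             # adversarial term for the generator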
models.py
CHANGED
@@ -40,33 +40,22 @@ class DurationDiscriminator(nn.Module):  # vits2
         self.norm_2 = modules.LayerNorm(filter_channels)
         self.dur_proj = nn.Conv1d(1, filter_channels, 1)

-        self.
-            2 * filter_channels, filter_channels,
+        self.LSTM = nn.LSTM(
+            2 * filter_channels, filter_channels, batch_first=True, bidirectional=True
         )
-        self.pre_out_norm_1 = modules.LayerNorm(filter_channels)
-        self.pre_out_conv_2 = nn.Conv1d(
-            filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
-        )
-        self.pre_out_norm_2 = modules.LayerNorm(filter_channels)

         if gin_channels != 0:
             self.cond = nn.Conv1d(gin_channels, in_channels, 1)

-        self.output_layer = nn.Sequential(
+        self.output_layer = nn.Sequential(
+            nn.Linear(2 * filter_channels, 1), nn.Sigmoid()
+        )

-    def forward_probability(self, x,
+    def forward_probability(self, x, dur):
         dur = self.dur_proj(dur)
         x = torch.cat([x, dur], dim=1)
-        x = self.pre_out_conv_1(x * x_mask)
-        x = torch.relu(x)
-        x = self.pre_out_norm_1(x)
-        x = self.drop(x)
-        x = self.pre_out_conv_2(x * x_mask)
-        x = torch.relu(x)
-        x = self.pre_out_norm_2(x)
-        x = self.drop(x)
-        x = x * x_mask
         x = x.transpose(1, 2)
+        x, _ = self.LSTM(x)
         output_prob = self.output_layer(x)
         return output_prob

@@ -86,7 +75,7 @@ class DurationDiscriminator(nn.Module):  # vits2

         output_probs = []
         for dur in [dur_r, dur_hat]:
-            output_prob = self.forward_probability(x,
+            output_prob = self.forward_probability(x, dur)
             output_probs.append(output_prob)

         return output_probs
@@ -354,7 +343,6 @@ class TextEncoder(nn.Module):
         n_layers,
         kernel_size,
         p_dropout,
-        n_speakers,
         gin_channels=0,
     ):
         super().__init__()
@@ -376,31 +364,6 @@ class TextEncoder(nn.Module):
         self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
         self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
         self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
-        # self.emo_proj = nn.Linear(512, hidden_channels)
-        self.in_feature_net = nn.Sequential(
-            # input is assumed to an already normalized embedding
-            nn.Linear(512, 1028, bias=False),
-            nn.GELU(),
-            nn.LayerNorm(1028),
-            *[Block(1028, 512) for _ in range(1)],
-            nn.Linear(1028, 512, bias=False),
-            # normalize before passing to VQ?
-            # nn.GELU(),
-            # nn.LayerNorm(512),
-        )
-        self.emo_vq = VectorQuantize(
-            dim=512,
-            codebook_size=64,
-            codebook_dim=32,
-            commitment_weight=0.1,
-            decay=0.85,
-            heads=32,
-            kmeans_iters=20,
-            separate_codebook_per_head=True,
-            stochastic_sample_codes=True,
-            threshold_ema_dead_code=2,
-        )
-        self.out_feature_net = nn.Linear(512, hidden_channels)

         self.encoder = attentions.Encoder(
             hidden_channels,
@@ -413,18 +376,10 @@ class TextEncoder(nn.Module):
         )
         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

-    def forward(
-        self, x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, sid, g=None
-    ):
-        sid = sid.cpu()
+    def forward(self, x, x_lengths, tone, language, bert, ja_bert, en_bert, g=None):
         bert_emb = self.bert_proj(bert).transpose(1, 2)
         ja_bert_emb = self.ja_bert_proj(ja_bert).transpose(1, 2)
         en_bert_emb = self.en_bert_proj(en_bert).transpose(1, 2)
-        emo_emb = self.in_feature_net(emo)
-        emo_emb, _, loss_commit = self.emo_vq(emo_emb.unsqueeze(1))
-        loss_commit = loss_commit.mean()
-        emo_emb = self.out_feature_net(emo_emb)
-        # emo_emb = self.emo_proj(emo.unsqueeze(1))
         x = (
             self.emb(x)
             + self.tone_emb(tone)
@@ -432,7 +387,6 @@ class TextEncoder(nn.Module):
             + bert_emb
             + ja_bert_emb
             + en_bert_emb
-            + emo_emb
         ) * math.sqrt(
             self.hidden_channels
         )  # [b, t, h]
@@ -445,7 +399,7 @@ class TextEncoder(nn.Module):
         stats = self.proj(x) * x_mask

         m, logs = torch.split(stats, self.out_channels, dim=1)
-        return x, m, logs, x_mask
+        return x, m, logs, x_mask


 class ResidualCouplingBlock(nn.Module):
@@ -748,6 +702,55 @@ class MultiPeriodDiscriminator(torch.nn.Module):
         return y_d_rs, y_d_gs, fmap_rs, fmap_gs


+class WavLMDiscriminator(nn.Module):
+    """docstring for Discriminator."""
+
+    def __init__(
+        self, slm_hidden=768, slm_layers=13, initial_channel=64, use_spectral_norm=False
+    ):
+        super(WavLMDiscriminator, self).__init__()
+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        self.pre = norm_f(
+            Conv1d(slm_hidden * slm_layers, initial_channel, 1, 1, padding=0)
+        )
+
+        self.convs = nn.ModuleList(
+            [
+                norm_f(
+                    nn.Conv1d(
+                        initial_channel, initial_channel * 2, kernel_size=5, padding=2
+                    )
+                ),
+                norm_f(
+                    nn.Conv1d(
+                        initial_channel * 2,
+                        initial_channel * 4,
+                        kernel_size=5,
+                        padding=2,
+                    )
+                ),
+                norm_f(
+                    nn.Conv1d(initial_channel * 4, initial_channel * 4, 5, 1, padding=2)
+                ),
+            ]
+        )
+
+        self.conv_post = norm_f(Conv1d(initial_channel * 4, 1, 3, 1, padding=1))
+
+    def forward(self, x):
+        x = self.pre(x)
+
+        fmap = []
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        x = torch.flatten(x, 1, -1)
+
+        return x
+
+
 class ReferenceEncoder(nn.Module):
     """
     inputs --- [N, Ty/r, n_mels*r] mels
@@ -878,7 +881,6 @@ class SynthesizerTrn(nn.Module):
             n_layers,
             kernel_size,
             p_dropout,
-            self.n_speakers,
             gin_channels=self.enc_gin_channels,
         )
         self.dec = Generator(
@@ -946,14 +948,13 @@ class SynthesizerTrn(nn.Module):
         bert,
         ja_bert,
         en_bert,
-        emo=None,
     ):
         if self.n_speakers > 0:
             g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
         else:
             g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
-        x, m_p, logs_p, x_mask
-            x, x_lengths, tone, language, bert, ja_bert, en_bert,
+        x, m_p, logs_p, x_mask = self.enc_p(
+            x, x_lengths, tone, language, bert, ja_bert, en_bert, g=g
         )
         z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
         z_p = self.flow(z, y_mask, g=g)
@@ -996,9 +997,11 @@ class SynthesizerTrn(nn.Module):

         logw_ = torch.log(w + 1e-6) * x_mask
         logw = self.dp(x, x_mask, g=g)
+        logw_sdp = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=1.0)
         l_length_dp = torch.sum((logw - logw_) ** 2, [1, 2]) / torch.sum(
             x_mask
         )  # for averaging
+        l_length_sdp += torch.sum((logw_sdp - logw_) ** 2, [1, 2]) / torch.sum(x_mask)

         l_length = l_length_dp + l_length_sdp

@@ -1018,9 +1021,8 @@ class SynthesizerTrn(nn.Module):
             x_mask,
             y_mask,
             (z, z_p, m_p, logs_p, m_q, logs_q),
-            (x, logw, logw_),
+            (x, logw, logw_, logw_sdp),
             g,
-            loss_commit,
         )

     def infer(
@@ -1033,7 +1035,6 @@ class SynthesizerTrn(nn.Module):
         bert,
         ja_bert,
         en_bert,
-        emo=None,
         noise_scale=0.667,
         length_scale=1,
         noise_scale_w=0.8,
@@ -1047,8 +1048,8 @@ class SynthesizerTrn(nn.Module):
             g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
         else:
             g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
-        x, m_p, logs_p, x_mask
-            x, x_lengths, tone, language, bert, ja_bert, en_bert,
+        x, m_p, logs_p, x_mask = self.enc_p(
+            x, x_lengths, tone, language, bert, ja_bert, en_bert, g=g
        )
         logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * (
             sdp_ratio
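The input expected by the new WavLMDiscriminator is the stack of all WavLM hidden states flattened into channels, exactly the tensor WavLMLoss builds before calling it. A shape-only sketch, with an arbitrary frame count:

import torch
from models import WavLMDiscriminator

batch, frames, slm_hidden, slm_layers = 1, 50, 768, 13
hidden_states = [torch.randn(batch, frames, slm_hidden) for _ in range(slm_layers)]
x = (
    torch.stack(hidden_states, dim=1)     # (B, 13, frames, 768)
    .transpose(-1, -2)                    # (B, 13, 768, frames)
    .flatten(start_dim=1, end_dim=2)      # (B, 13*768, frames)
)
wd = WavLMDiscriminator()                 # the pre conv expects 13*768 = 9984 channels
print(wd(x).shape)                        # flattened per-frame scores, e.g. (1, 50)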
onnx_infer.py
ADDED
@@ -0,0 +1,68 @@
+from onnx_modules.V220_OnnxInference import OnnxInferenceSession
+import numpy as np
+Session = OnnxInferenceSession(
+    {
+        "enc": "onnx/BertVits2.2PT/BertVits2.2PT_enc_p.onnx",
+        "emb_g": "onnx/BertVits2.2PT/BertVits2.2PT_emb.onnx",
+        "dp": "onnx/BertVits2.2PT/BertVits2.2PT_dp.onnx",
+        "sdp": "onnx/BertVits2.2PT/BertVits2.2PT_sdp.onnx",
+        "flow": "onnx/BertVits2.2PT/BertVits2.2PT_flow.onnx",
+        "dec": "onnx/BertVits2.2PT/BertVits2.2PT_dec.onnx",
+    },
+    Providers=["CPUExecutionProvider"],
+)
+
+# 这里的输入和原版是一样的,只需要在原版预处理结果出来之后加上.numpy()即可
+x = np.array(
+    [
+        0, 97, 0, 8, 0, 78, 0, 8, 0, 76, 0, 37, 0, 40, 0, 97, 0, 8,
+        0, 23, 0, 8, 0, 74, 0, 26, 0, 104, 0,
+    ]
+)
+tone = np.zeros_like(x)
+language = np.zeros_like(x)
+sid = np.array([0])
+bert = np.random.randn(x.shape[0], 1024)
+ja_bert = np.random.randn(x.shape[0], 1024)
+en_bert = np.random.randn(x.shape[0], 1024)
+emo = np.random.randn(512, 1)
+
+audio = Session(
+    x,
+    tone,
+    language,
+    bert,
+    ja_bert,
+    en_bert,
+    emo,
+    sid,
+)
+
+print(audio)
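A hedged follow-up for the script above: writing the returned waveform to disk. The exact output shape of OnnxInferenceSession is an assumption here (hence the squeeze), and the stand-in array only keeps the snippet self-contained.

import numpy as np
import soundfile

audio = np.random.randn(1, 1, 44100)   # stand-in for the Session(...) result above
soundfile.write("onnx_test.wav", np.asarray(audio, dtype=np.float32).squeeze(), 44100)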
re_matching.py
CHANGED
@@ -44,7 +44,6 @@ def text_matching(text: str) -> list:
     result = []
     for speaker, dialogue in matches:
         result.append(extract_language_and_text_updated(speaker, dialogue))
-    print(result)
     return result


requirements.txt
CHANGED
@@ -11,7 +11,7 @@ jieba
 transformers
 pypinyin
 cn2an
-gradio==3.
+gradio==3.50.2
 av
 mecab-python3
 loguru
@@ -21,8 +21,7 @@ fugashi
 num2words
 PyYAML
 requests
-pyopenjtalk
-openjtalk; sys_platform != 'linux'
+pyopenjtalk-prebuilt
 jaconv
 psutil
 GPUtil
resample.py
CHANGED
@@ -10,11 +10,11 @@ from config import config


 def process(item):
-    wav_name, args = item
-    wav_path = os.path.join(args.in_dir, wav_name)
+    spkdir, wav_name, args = item
+    wav_path = os.path.join(args.in_dir, spkdir, wav_name)
     if os.path.exists(wav_path) and wav_path.lower().endswith(".wav"):
         wav, sr = librosa.load(wav_path, sr=args.sr)
-        soundfile.write(os.path.join(args.out_dir, wav_name), wav, sr)
+        soundfile.write(os.path.join(args.out_dir, spkdir, wav_name), wav, sr)


 if __name__ == "__main__":
@@ -54,11 +54,15 @@ if __name__ == "__main__":
     tasks = []

     for dirpath, _, filenames in os.walk(args.in_dir):
-
-
+        # 子级目录
+        spk_dir = os.path.relpath(dirpath, args.in_dir)
+        spk_dir_out = os.path.join(args.out_dir, spk_dir)
+        if not os.path.isdir(spk_dir_out):
+            os.makedirs(spk_dir_out, exist_ok=True)
         for filename in filenames:
             if filename.lower().endswith(".wav"):
-
+                twople = (spk_dir, filename, args)
+                tasks.append(twople)

     for _ in tqdm(
         pool.imap_unordered(process, tasks),
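The new per-speaker handling boils down to re-rooting each walked directory from in_dir onto out_dir; a minimal sketch of that path arithmetic, with placeholder directory names:

import os

in_dir, out_dir = "raw", "wavs"                    # placeholder directories
dirpath = os.path.join(in_dir, "speakerA")         # what os.walk would yield
spk_dir = os.path.relpath(dirpath, in_dir)         # -> "speakerA"
print(os.path.join(out_dir, spk_dir, "001.wav"))   # -> wavs/speakerA/001.wav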
resample_legacy.py
ADDED
@@ -0,0 +1,71 @@
+import os
+import argparse
+import librosa
+from multiprocessing import Pool, cpu_count
+
+import soundfile
+from tqdm import tqdm
+
+from config import config
+
+
+def process(item):
+    wav_name, args = item
+    wav_path = os.path.join(args.in_dir, wav_name)
+    if os.path.exists(wav_path) and wav_path.lower().endswith(".wav"):
+        wav, sr = librosa.load(wav_path, sr=args.sr)
+        soundfile.write(os.path.join(args.out_dir, wav_name), wav, sr)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--sr",
+        type=int,
+        default=config.resample_config.sampling_rate,
+        help="sampling rate",
+    )
+    parser.add_argument(
+        "--in_dir",
+        type=str,
+        default=config.resample_config.in_dir,
+        help="path to source dir",
+    )
+    parser.add_argument(
+        "--out_dir",
+        type=str,
+        default=config.resample_config.out_dir,
+        help="path to target dir",
+    )
+    parser.add_argument(
+        "--processes",
+        type=int,
+        default=0,
+        help="cpu_processes",
+    )
+    args, _ = parser.parse_known_args()
+    # autodl 无卡模式会识别出46个cpu
+    if args.processes == 0:
+        processes = cpu_count() - 2 if cpu_count() > 4 else 1
+    else:
+        processes = args.processes
+    pool = Pool(processes=processes)
+
+    tasks = []
+
+    for dirpath, _, filenames in os.walk(args.in_dir):
+        if not os.path.isdir(args.out_dir):
+            os.makedirs(args.out_dir, exist_ok=True)
+        for filename in filenames:
+            if filename.lower().endswith(".wav"):
+                tasks.append((filename, args))
+
+    for _ in tqdm(
+        pool.imap_unordered(process, tasks),
+    ):
+        pass
+
+    pool.close()
+    pool.join()
+
+    print("音频重采样完毕!")
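The worker-count fallback used above, shown in isolation: leave two cores free on machines with more than four CPUs, otherwise fall back to a single worker (the printed value depends on the host running it).

from multiprocessing import cpu_count

processes = cpu_count() - 2 if cpu_count() > 4 else 1
print(processes)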
server.py
CHANGED
@@ -4,9 +4,6 @@ from pathlib import Path
|
|
4 |
|
5 |
import logging
|
6 |
import re_matching
|
7 |
-
import uuid
|
8 |
-
from flask import Flask, request, jsonify, render_template_string
|
9 |
-
from flask_cors import CORS
|
10 |
|
11 |
logging.getLogger("numba").setLevel(logging.WARNING)
|
12 |
logging.getLogger("markdown_it").setLevel(logging.WARNING)
|
@@ -18,6 +15,7 @@ logging.basicConfig(
|
|
18 |
)
|
19 |
|
20 |
logger = logging.getLogger(__name__)
|
|
|
21 |
import librosa
|
22 |
import numpy as np
|
23 |
import torch
|
@@ -25,25 +23,31 @@ import torch.nn as nn
|
|
25 |
from torch.utils.data import Dataset
|
26 |
from torch.utils.data import DataLoader, Dataset
|
27 |
from tqdm import tqdm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
import utils
|
30 |
from config import config
|
31 |
-
|
32 |
import torch
|
33 |
import commons
|
34 |
from text import cleaned_text_to_sequence, get_bert
|
35 |
-
from clap_wrapper import get_clap_audio_feature, get_clap_text_feature
|
36 |
-
|
37 |
from text.cleaner import clean_text
|
38 |
import utils
|
39 |
|
40 |
from models import SynthesizerTrn
|
41 |
from text.symbols import symbols
|
42 |
import sys
|
43 |
-
|
44 |
from scipy.io.wavfile import write
|
|
|
45 |
|
46 |
net_g = None
|
|
|
47 |
device = (
|
48 |
"cuda:0"
|
49 |
if torch.cuda.is_available()
|
@@ -54,7 +58,22 @@ device = (
|
|
54 |
)
|
55 |
)
|
56 |
|
57 |
-
#device =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
def get_net_g(model_path: str, device: str, hps):
|
60 |
net_g = SynthesizerTrn(
|
@@ -68,11 +87,11 @@ def get_net_g(model_path: str, device: str, hps):
|
|
68 |
_ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
|
69 |
return net_g
|
70 |
|
71 |
-
|
72 |
-
|
73 |
norm_text, phone, tone, word2ph = clean_text(text, language_str)
|
74 |
phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
|
75 |
-
|
76 |
if hps.data.add_blank:
|
77 |
phone = commons.intersperse(phone, 0)
|
78 |
tone = commons.intersperse(tone, 0)
|
@@ -80,18 +99,24 @@ def get_text(text, language_str, hps, device):
|
|
80 |
for i in range(len(word2ph)):
|
81 |
word2ph[i] = word2ph[i] * 2
|
82 |
word2ph[0] += 1
|
83 |
-
bert_ori = get_bert(
|
|
|
|
|
84 |
del word2ph
|
85 |
assert bert_ori.shape[-1] == len(phone), phone
    if language_str == "ZH":
        bert = bert_ori
-        ja_bert = torch.
-        en_bert = torch.
    elif language_str == "JP":
-        bert = torch.
        ja_bert = bert_ori
-        en_bert = torch.
    else:
        raise ValueError("language_str should be ZH, JP or EN")

@@ -104,6 +129,7 @@ def get_text(text, language_str, hps, device):
    language = torch.LongTensor(language)
    return bert, ja_bert, en_bert, phone, tone, language

def infer(
    text,
    sdp_ratio,
@@ -111,18 +137,18 @@ def infer(
    noise_scale_w,
    length_scale,
    sid,
-
-
):
    language= 'JP' if is_japanese(text) else 'ZH'
-    if isinstance(reference_audio, np.ndarray):
-        emo = get_clap_audio_feature(reference_audio, device)
-    else:
-        emo = get_clap_text_feature(emotion, device)
-    emo = torch.squeeze(emo, dim=1)
    bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
-        text,
    )
    with torch.no_grad():
        x_tst = phones.to(device).unsqueeze(0)
@@ -132,7 +158,7 @@ def infer(
        ja_bert = ja_bert.to(device).unsqueeze(0)
        en_bert = en_bert.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
-        emo = emo.to(device).unsqueeze(0)
        del phones
        speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
        audio = (
@@ -145,7 +171,6 @@
            bert,
            ja_bert,
            en_bert,
-            emo,
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
@@ -155,7 +180,80 @@
            .float()
            .numpy()
        )
-        del
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    unique_filename = f"temp{uuid.uuid4()}.wav"
@@ -176,19 +274,11 @@ def loadmodel(model):
    except:
        return "error"

-
-
-
-    data = {'text': text}
-    try:
-        response = requests.post(url, files=files,data=data)
-        return response.status_code, response.text
-    except Exception as e:
-        return 500, str(e)

-
-CORS(app)
-@app.route('/')

def tts():
    global last_text, last_model
@@ -197,7 +287,8 @@ def tts():
    noise_scale = float(request.args.get('noise_scale', 0.6))
    noise_scale_w = float(request.args.get('noise_scale_w', 0.8))
    length_scale = float(request.args.get('length_scale', 1))
-
    text = request.args.get('text')
    is_chat = request.args.get('is_chat', 'false').lower() == 'true'
    model = request.args.get('model',modelPaths[-1])
@@ -210,7 +301,7 @@
        <title>TTS API Documentation</title>
        </head>
        <body>
-        <iframe src="http://
        </body>
        </html>
        """)
@@ -225,9 +316,7 @@
        write(unique_filename , 44100, silence)
    else:
        last_text = text
-        unique_filename =
-        status_code, response_text = send_audio_to_server(unique_filename,text)
-        print(f"Response from server: {response_text} (Status code: {status_code})")
    with open(unique_filename ,'rb') as bit:
        wav_bytes = bit.read()
    os.remove(unique_filename)
@@ -236,14 +325,16 @@
               'Text': unique_filename .encode('utf-8')}
    return wav_bytes, 200, headers


if __name__ == "__main__":
    languages = [ "Auto", "ZH", "JP"]
    modelPaths = []
-    for dirpath, dirnames, filenames in os.walk(
        for filename in filenames:
            modelPaths.append(os.path.join(dirpath, filename))
-    hps = utils.get_hparams_from_file('Data/
    net_g = get_net_g(
        model_path=modelPaths[-1], device=device, hps=hps
    )
@@ -251,4 +342,80 @@ if __name__ == "__main__":
    speakers = list(speaker_ids.keys())
    last_text = ""
    last_model = modelPaths[-1]
-

import logging
import re_matching

logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
...
)

logger = logging.getLogger(__name__)
+
import librosa
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
+from clap_wrapper import get_clap_audio_feature, get_clap_text_feature
+
+import uuid
+from flask import Flask, request, jsonify, render_template_string
+from flask_cors import CORS
+
+import gradio as gr

import utils
from config import config
+
import torch
import commons
from text import cleaned_text_to_sequence, get_bert
from text.cleaner import clean_text
import utils

from models import SynthesizerTrn
from text.symbols import symbols
import sys
from scipy.io.wavfile import write
+from threading import Thread

net_g = None
+
device = (
    "cuda:0"
    if torch.cuda.is_available()
    ...
    )
)

+#device = "cpu"
+BandList = {
+    "PoppinParty":["香澄","有咲","たえ","りみ","沙綾"],
+    "Afterglow":["蘭","モカ","ひまり","巴","つぐみ"],
+    "HelloHappyWorld":["こころ","美咲","薫","花音","はぐみ"],
+    "PastelPalettes":["彩","日菜","千聖","イヴ","麻弥"],
+    "Roselia":["友希那","紗夜","リサ","燐子","あこ"],
+    "RaiseASuilen":["レイヤ","ロック","ますき","チュチュ","パレオ"],
+    "Morfonica":["ましろ","瑠唯","つくし","七深","透子"],
+    "MyGo":["燈","愛音","そよ","立希","楽奈"],
+    "AveMujica":["祥子","睦","海鈴","にゃむ","初華"],
+    "圣翔音乐学园":["華戀","光","香子","雙葉","真晝","純那","克洛迪娜","真矢","奈奈"],
+    "凛明馆女子学校":["珠緒","壘","文","悠悠子","一愛"],
+    "弗隆提亚艺术学校":["艾露","艾露露","菈樂菲","司","靜羽"],
+    "西克菲尔特音乐学院":["晶","未知留","八千代","栞","美帆"]
+}

def get_net_g(model_path: str, device: str, hps):
    net_g = SynthesizerTrn(
    ...
    _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
    return net_g

+def get_text(text, language_str, hps, device, style_text=None, style_weight=0.7):
+    style_text = None if style_text == "" else style_text
    norm_text, phone, tone, word2ph = clean_text(text, language_str)
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
+
    if hps.data.add_blank:
        phone = commons.intersperse(phone, 0)
        tone = commons.intersperse(tone, 0)
        ...
        for i in range(len(word2ph)):
            word2ph[i] = word2ph[i] * 2
        word2ph[0] += 1
+    bert_ori = get_bert(
+        norm_text, word2ph, language_str, device, style_text, style_weight
+    )
    del word2ph
    assert bert_ori.shape[-1] == len(phone), phone

    if language_str == "ZH":
        bert = bert_ori
+        ja_bert = torch.randn(1024, len(phone))
+        en_bert = torch.randn(1024, len(phone))
    elif language_str == "JP":
+        bert = torch.randn(1024, len(phone))
        ja_bert = bert_ori
+        en_bert = torch.randn(1024, len(phone))
+    elif language_str == "EN":
+        bert = torch.randn(1024, len(phone))
+        ja_bert = torch.randn(1024, len(phone))
+        en_bert = bert_ori
    else:
        raise ValueError("language_str should be ZH, JP or EN")

    ...
    language = torch.LongTensor(language)
    return bert, ja_bert, en_bert, phone, tone, language

+
def infer(
    text,
    sdp_ratio,
    noise_scale_w,
    length_scale,
    sid,
+    style_text=None,
+    style_weight=0.7,
):

    language= 'JP' if is_japanese(text) else 'ZH'
    bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
+        text,
+        language,
+        hps,
+        device,
+        style_text=style_text,
+        style_weight=style_weight,
    )
    with torch.no_grad():
        x_tst = phones.to(device).unsqueeze(0)
        ...
        ja_bert = ja_bert.to(device).unsqueeze(0)
        en_bert = en_bert.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
+        # emo = emo.to(device).unsqueeze(0)
        del phones
        speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
        audio = (
            ...
            bert,
            ja_bert,
            en_bert,
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            ...
            .float()
            .numpy()
        )
+        del (
+            x_tst,
+            tones,
+            lang_ids,
+            bert,
+            x_tst_lengths,
+            speakers,
+            ja_bert,
+            en_bert,
+        )  # , emo
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return (hps.data.sampling_rate,gr.processing_utils.convert_to_16_bit_wav(audio))
+
+def inferAPI(
+    text,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    sid,
+    style_text=None,
+    style_weight=0.7,
+):
+
+    language= 'JP' if is_japanese(text) else 'ZH'
+    bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
+        text,
+        language,
+        hps,
+        device,
+        style_text=style_text,
+        style_weight=style_weight,
+    )
+    with torch.no_grad():
+        x_tst = phones.to(device).unsqueeze(0)
+        tones = tones.to(device).unsqueeze(0)
+        lang_ids = lang_ids.to(device).unsqueeze(0)
+        bert = bert.to(device).unsqueeze(0)
+        ja_bert = ja_bert.to(device).unsqueeze(0)
+        en_bert = en_bert.to(device).unsqueeze(0)
+        x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
+        # emo = emo.to(device).unsqueeze(0)
+        del phones
+        speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
+        audio = (
+            net_g.infer(
+                x_tst,
+                x_tst_lengths,
+                speakers,
+                tones,
+                lang_ids,
+                bert,
+                ja_bert,
+                en_bert,
+                sdp_ratio=sdp_ratio,
+                noise_scale=noise_scale,
+                noise_scale_w=noise_scale_w,
+                length_scale=length_scale,
+            )[0][0, 0]
+            .data.cpu()
+            .float()
+            .numpy()
+        )
+        del (
+            x_tst,
+            tones,
+            lang_ids,
+            bert,
+            x_tst_lengths,
+            speakers,
+            ja_bert,
+            en_bert,
+        )  # , emo
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        unique_filename = f"temp{uuid.uuid4()}.wav"
        ...
    except:
        return "error"

+Flaskapp = Flask(__name__)
+CORS(Flaskapp)
+@Flaskapp.route('/')

+@Flaskapp.route('/')

def tts():
    global last_text, last_model
    ...
    noise_scale = float(request.args.get('noise_scale', 0.6))
    noise_scale_w = float(request.args.get('noise_scale_w', 0.8))
    length_scale = float(request.args.get('length_scale', 1))
+    style_weight = float(request.args.get('style_weight', 0.7))
+    style_text = request.args.get('style_text', 'happy')
    text = request.args.get('text')
    is_chat = request.args.get('is_chat', 'false').lower() == 'true'
    model = request.args.get('model',modelPaths[-1])
    ...
        <title>TTS API Documentation</title>
        </head>
        <body>
+        <iframe src="http://127.0.0.1:7860" style="width:100%; height:100vh; border:none;"></iframe>
        </body>
        </html>
        """)
    ...
        write(unique_filename , 44100, silence)
    else:
        last_text = text
+        unique_filename = inferAPI(text, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale,sid = speaker, style_text=style_text, style_weight=style_weight)
    with open(unique_filename ,'rb') as bit:
        wav_bytes = bit.read()
    os.remove(unique_filename)
    ...
               'Text': unique_filename .encode('utf-8')}
    return wav_bytes, 200, headers

+def gradio_interface():
+    return app.launch(share=True)

if __name__ == "__main__":
    languages = [ "Auto", "ZH", "JP"]
    modelPaths = []
+    for dirpath, dirnames, filenames in os.walk('Data/V23/models/'):
        for filename in filenames:
            modelPaths.append(os.path.join(dirpath, filename))
+    hps = utils.get_hparams_from_file('Data/V23/configs/config.json')
    net_g = get_net_g(
        model_path=modelPaths[-1], device=device, hps=hps
    )
    ...
    speakers = list(speaker_ids.keys())
    last_text = ""
    last_model = modelPaths[-1]
+    with gr.Blocks() as app:
+        for band in BandList:
+            with gr.TabItem(band):
+                for name in BandList[band]:
+                    with gr.TabItem(name):
+                        with gr.Row():
+                            with gr.Column():
+                                with gr.Row():
+                                    gr.Markdown(
+                                        '<div align="center">'
+                                        f'<img style="width:auto;height:400px;" src="https://mahiruoshi-bangdream-bert-vits2.hf.space/file/image/{name}.png">'
+                                        '</div>'
+                                    )
+                                length_scale = gr.Slider(
+                                    minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
+                                )
+                                with gr.Accordion(label="参数设定", open=False):
+                                    sdp_ratio = gr.Slider(
+                                        minimum=0, maximum=1, value=0.5, step=0.01, label="SDP/DP混合比"
+                                    )
+                                    noise_scale = gr.Slider(
+                                        minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
+                                    )
+                                    noise_scale_w = gr.Slider(
+                                        minimum=0.1, maximum=2, value=0.667, step=0.01, label="音素长度"
+                                    )
+                                    speaker = gr.Dropdown(
+                                        choices=speakers, value=name, label="说话人"
+                                    )
+                                with gr.Accordion(label="切换模型", open=False):
+                                    modelstrs = gr.Dropdown(label = "模型", choices = modelPaths, value = modelPaths[0], type = "value")
+                                    btnMod = gr.Button("载入模型")
+                                    statusa = gr.TextArea()
+                                    btnMod.click(loadmodel, inputs=[modelstrs], outputs = [statusa])
+                            with gr.Column():
+                                text = gr.TextArea(
+                                    label="输入纯日语或者中文",
+                                    placeholder="输入纯日语或者中文",
+                                    value="为什么要演奏春日影!",
+                                )
+                                style_text = gr.Textbox(label="辅助文本")
+                                style_weight = gr.Slider(
+                                    minimum=0,
+                                    maximum=1,
+                                    value=0.7,
+                                    step=0.1,
+                                    label="Weight",
+                                    info="主文本和辅助文本的bert混合比率,0表示仅主文本,1表示仅辅助文本",
+                                )
+                                btn = gr.Button("点击生成", variant="primary")
+                                audio_output = gr.Audio(label="Output Audio")
+                                '''
+                                btntran = gr.Button("快速中翻日")
+                                translateResult = gr.TextArea("从这复制翻译后的文本")
+                                btntran.click(translate, inputs=[text], outputs = [translateResult])
+                                '''
+                                btn.click(
+                                    infer,
+                                    inputs=[
+                                        text,
+                                        sdp_ratio,
+                                        noise_scale,
+                                        noise_scale_w,
+                                        length_scale,
+                                        speaker,
+                                        style_text,
+                                        style_weight,
+                                    ],
+                                    outputs=[audio_output],
+                                )
+
+    api_thread = Thread(target=Flaskapp.run, args=("0.0.0.0", 5000))
+    gradio_thread = Thread(target=gradio_interface)
+    gradio_thread.start()
+    print("推理页面已开启!")
+    api_thread.start()
+    print("api页面已开启!运行在5000端口")
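The rewritten Flask service above exposes the same generation controls as the Gradio tabs, including the new style_text / style_weight pair, as query-string parameters and answers with raw WAV bytes. A minimal client sketch, assuming the service runs locally on port 5000 at the root route shown above; the speaker and sdp_ratio parameters are assumed to be read in the elided lines the same way as the other request.args.get calls, so those names are assumptions:

```python
# Hedged client sketch for the Flask route above (assumed host/port and some
# parameter names; the visible ones follow the request.args.get calls in tts()).
import requests

params = {
    "text": "为什么要演奏春日影!",
    "speaker": "燈",          # assumed parameter name; must match a loaded speaker
    "sdp_ratio": 0.5,         # assumed to be read like the parameters shown above
    "noise_scale": 0.6,
    "noise_scale_w": 0.8,
    "length_scale": 1.0,
    "style_text": "happy",    # auxiliary text whose BERT features get blended in
    "style_weight": 0.7,      # 0 = main text only, 1 = auxiliary text only
}
resp = requests.get("http://127.0.0.1:5000/", params=params, timeout=300)
with open("out.wav", "wb") as f:
    f.write(resp.content)     # the route returns raw WAV bytes
```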
server_fastapi.py
CHANGED
@@ -5,6 +5,7 @@ import logging
 import gc
 import random

+import librosa
 import gradio
 import numpy as np
 import utils
@@ -203,28 +204,48 @@ if __name__ == "__main__":
         auto_split: bool,
         emotion: Optional[Union[int, str]] = None,
         reference_audio=None,
+        style_text: Optional[str] = None,
+        style_weight: float = 0.7,
     ) -> Union[Response, Dict[str, any]]:
         """TTS实现函数"""
         # 检查模型是否存在
         if model_id not in loaded_models.models.keys():
+            logger.error(f"/voice 请求错误:模型model_id={model_id}未加载")
             return {"status": 10, "detail": f"模型model_id={model_id}未加载"}
         # 检查是否提供speaker
         if speaker_name is None and speaker_id is None:
+            logger.error("/voice 请求错误:推理请求未提供speaker_name或speaker_id")
             return {"status": 11, "detail": "请提供speaker_name或speaker_id"}
         elif speaker_name is None:
             # 检查speaker_id是否存在
             if speaker_id not in loaded_models.models[model_id].id2spk.keys():
+                logger.error(f"/voice 请求错误:角色speaker_id={speaker_id}不存在")
                 return {"status": 12, "detail": f"角色speaker_id={speaker_id}不存在"}
             speaker_name = loaded_models.models[model_id].id2spk[speaker_id]
         # 检查speaker_name是否存在
         if speaker_name not in loaded_models.models[model_id].spk2id.keys():
+            logger.error(f"/voice 请求错误:角色speaker_name={speaker_name}不存在")
             return {"status": 13, "detail": f"角色speaker_name={speaker_name}不存在"}
+        # 未传入则使用默认语言
         if language is None:
             language = loaded_models.models[model_id].language
+        # 翻译会破坏mix结构,auto也会变得无意义。不要在这两个模式下使用
         if auto_translate:
+            if language == "auto" or language == "mix":
+                logger.error(
+                    f"/voice 请求错误:请勿同时使用language = {language}与auto_translate模式"
+                )
+                return {
+                    "status": 20,
+                    "detail": f"请勿同时使用language = {language}与auto_translate模式",
+                }
             text = trans.translate(Sentence=text, to_Language=language.lower())
         if reference_audio is not None:
             ref_audio = BytesIO(await reference_audio.read())
+            # 2.2 适配
+            if loaded_models.models[model_id].version == "2.2":
+                ref_audio, _ = librosa.load(ref_audio, 48000)
+
         else:
             ref_audio = reference_audio
         if not auto_split:
@@ -242,6 +263,8 @@ if __name__ == "__main__":
                 device=loaded_models.models[model_id].device,
                 emotion=emotion,
                 reference_audio=ref_audio,
+                style_text=style_text,
+                style_weight=style_weight,
             )
             audio = gradio.processing_utils.convert_to_16_bit_wav(audio)
         else:
@@ -263,6 +286,8 @@ if __name__ == "__main__":
                     device=loaded_models.models[model_id].device,
                     emotion=emotion,
                     reference_audio=ref_audio,
+                    style_text=style_text,
+                    style_weight=style_weight,
                 )
             )
             audios.append(np.zeros(int(44100 * 0.2)))
@@ -293,6 +318,8 @@ if __name__ == "__main__":
         auto_split: bool = Query(False, description="自动切分"),
         emotion: Optional[Union[int, str]] = Query(None, description="emo"),
         reference_audio: UploadFile = File(None),
+        style_text: Optional[str] = Form(None, description="风格文本"),
+        style_weight: float = Query(0.7, description="风格权重"),
     ):
         """语音接口,若需要上传参考音频请仅使用post请求"""
         logger.info(
@@ -312,6 +339,8 @@ if __name__ == "__main__":
             auto_split=auto_split,
             emotion=emotion,
             reference_audio=reference_audio,
+            style_text=style_text,
+            style_weight=style_weight,
         )

     @app.get("/voice")
@@ -331,6 +360,8 @@ if __name__ == "__main__":
         auto_translate: bool = Query(False, description="自动翻译"),
         auto_split: bool = Query(False, description="自动切分"),
         emotion: Optional[Union[int, str]] = Query(None, description="emo"),
+        style_text: Optional[str] = Query(None, description="风格文本"),
+        style_weight: float = Query(0.7, description="风格权重"),
     ):
         """语音接口"""
         logger.info(
@@ -349,6 +380,8 @@ if __name__ == "__main__":
             auto_translate=auto_translate,
             auto_split=auto_split,
             emotion=emotion,
+            style_text=style_text,
+            style_weight=style_weight,
         )

     @app.get("/models/info")
@@ -370,7 +403,9 @@ if __name__ == "__main__":
         )
         result = loaded_models.del_model(model_id)
         if result is None:
+            logger.error(f"/models/delete 模型删除错误:模型{model_id}不存在,删除失败")
             return {"status": 14, "detail": f"模型{model_id}不存在,删除失败"}
+
         return {"status": 0, "detail": "删除成功"}

     @app.get("/models/add")
@@ -394,6 +429,7 @@ if __name__ == "__main__":
         elif os.path.isfile(os.path.join(model_dir, "../config.json")):
             config_path = os.path.join(model_dir, "../config.json")
         else:
+            logger.error("/models/add 模型添加失败:未在模型所在目录以及上级目录找到config.json文件")
             return {
                 "status": 15,
                 "detail": "查询未传入配置文件路径,同时默认路径./与../中不存在配置文件config.json。",
@@ -628,8 +664,10 @@ if __name__ == "__main__":
             f"{request.client.host}:{request.client.port}/tools/get_audio { unquote(str(request.query_params) )}"
         )
         if not os.path.isfile(path):
+            logger.error(f"/tools/get_audio 获取音频错误:指定音频{path}不存在")
             return {"status": 18, "detail": "指定音频不存在"}
-        if not path.endswith(".wav"):
+        if not path.lower().endswith(".wav"):
+            logger.error(f"/tools/get_audio 获取音频错误:音频{path}非wav文件")
             return {"status": 19, "detail": "非wav格式文件"}
         return FileResponse(path=path)

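On the FastAPI side the new style_text / style_weight parameters ride along with the existing /voice arguments. A hedged GET sketch; host, port and the non-style parameter names such as text, model_id and speaker_name are assumptions based on the rest of the project, since their declarations sit outside this hunk:

```python
# Hedged /voice client sketch (assumed host/port and non-style parameter names).
import requests

params = {
    "model_id": 0,            # assumed: id of a model already loaded on the server
    "speaker_name": "燈",     # assumed speaker name from that model
    "text": "おはようございます。",
    "language": "JP",
    "style_text": "嬉しい",   # matches the Query(...) declarations shown above
    "style_weight": 0.7,      # defaults to 0.7 when omitted
}
resp = requests.get("http://127.0.0.1:5000/voice", params=params, timeout=300)
with open("voice.wav", "wb") as f:
    f.write(resp.content)
```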
slm/wavlm-base-plus/.gitattributes
ADDED
@@ -0,0 +1,27 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
slm/wavlm-base-plus/README.md
ADDED
@@ -0,0 +1,65 @@
---
language:
- en
datasets:
tags:
- speech
inference: false
---

# WavLM-Base-Plus

[Microsoft's WavLM](https://github.com/microsoft/unilm/tree/master/wavlm)

The base model pretrained on 16kHz sampled speech audio. When using the model, make sure that your speech input is also sampled at 16kHz.

**Note**: This model does not have a tokenizer as it was pretrained on audio alone. In order to use this model for **speech recognition**, a tokenizer should be created and the model should be fine-tuned on labeled text data. Check out [this blog](https://huggingface.co/blog/fine-tune-wav2vec2-english) for a more in-detail explanation of how to fine-tune the model.

The model was pre-trained on:

- 60,000 hours of [Libri-Light](https://arxiv.org/abs/1912.07875)
- 10,000 hours of [GigaSpeech](https://arxiv.org/abs/2106.06909)
- 24,000 hours of [VoxPopuli](https://arxiv.org/abs/2101.00390)

[Paper: WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900)

Authors: Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei

**Abstract**
*Self-supervised learning (SSL) achieves great success in speech recognition, while limited exploration has been attempted for other speech processing tasks. As speech signal contains multi-faceted information including speaker identity, paralinguistics, spoken content, etc., learning universal representations for all speech tasks is challenging. In this paper, we propose a new pre-trained model, WavLM, to solve full-stack downstream speech tasks. WavLM is built based on the HuBERT framework, with an emphasis on both spoken content modeling and speaker identity preservation. We first equip the Transformer structure with gated relative position bias to improve its capability on recognition tasks. For better speaker discrimination, we propose an utterance mixing training strategy, where additional overlapped utterances are created unsupervisely and incorporated during model training. Lastly, we scale up the training dataset from 60k hours to 94k hours. WavLM Large achieves state-of-the-art performance on the SUPERB benchmark, and brings significant improvements for various speech processing tasks on their representative benchmarks.*

The original model can be found under https://github.com/microsoft/unilm/tree/master/wavlm.

# Usage

This is an English pre-trained speech model that has to be fine-tuned on a downstream task like speech recognition or audio classification before it can be used in inference. The model was pre-trained in English and should therefore perform well only in English. The model has been shown to work well on the [SUPERB benchmark](https://superbbenchmark.org/).

**Note**: The model was pre-trained on phonemes rather than characters. This means that one should make sure that the input text is converted to a sequence of phonemes before fine-tuning.

## Speech Recognition

To fine-tune the model for speech recognition, see [the official speech recognition example](https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition).

## Speech Classification

To fine-tune the model for speech classification, see [the official audio classification example](https://github.com/huggingface/transformers/tree/master/examples/pytorch/audio-classification).

## Speaker Verification

TODO

## Speaker Diarization

TODO

# Contribution

The model was contributed by [cywang](https://huggingface.co/cywang) and [patrickvonplaten](https://huggingface.co/patrickvonplaten).

# License

The official license can be found [here](https://github.com/microsoft/UniSpeech/blob/main/LICENSE)

![design](https://raw.githubusercontent.com/patrickvonplaten/scientific_images/master/wavlm.png)
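The vendored checkpoint can be loaded with the standard transformers classes; the preprocessor settings that follow (16 kHz input, Wav2Vec2FeatureExtractor) apply. How this repository's training code actually consumes WavLM (for example inside an SLM-based loss) is not shown in these hunks, so the following is only a generic loading sketch:

```python
# Generic sketch: extract WavLM hidden states from a 16 kHz waveform.
# The local path mirrors where this repo vendors the checkpoint.
import torch
from transformers import Wav2Vec2FeatureExtractor, WavLMModel

extractor = Wav2Vec2FeatureExtractor.from_pretrained("./slm/wavlm-base-plus")
model = WavLMModel.from_pretrained("./slm/wavlm-base-plus").eval()

wav = torch.zeros(16000)  # one second of (silent) 16 kHz audio as a stand-in
inputs = extractor(wav.numpy(), sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    hidden = model(**inputs).last_hidden_state  # (1, frames, 768) per config.json
print(hidden.shape)
```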
slm/wavlm-base-plus/config.json
ADDED
@@ -0,0 +1,99 @@
{
  "_name_or_path": "wavlm-base-plus",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": ["WavLMModel"],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [512, 512, 512, 512, 512, 512, 512],
  "conv_kernel": [10, 3, 3, 3, 3, 2, 2],
  "conv_stride": [5, 2, 2, 2, 2, 2, 2],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.1,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.0,
  "freeze_feat_extract_train": true,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.05,
  "mask_channel_length": 10,
  "mask_channel_min_space": 1,
  "mask_channel_other": 0.0,
  "mask_channel_prob": 0.0,
  "mask_channel_selection": "static",
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_min_space": 1,
  "mask_time_other": 0.0,
  "mask_time_prob": 0.05,
  "mask_time_selection": "static",
  "model_type": "wavlm",
  "no_mask_channel_overlap": false,
  "no_mask_time_overlap": false,
  "num_adapter_layers": 3,
  "num_attention_heads": 12,
  "num_buckets": 320,
  "num_codevector_groups": 2,
  "num_codevectors_per_group": 320,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_ctc_classes": 80,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "num_negatives": 100,
  "output_hidden_size": 768,
  "pad_token_id": 0,
  "proj_codevector_dim": 256,
  "replace_prob": 0.5,
  "torch_dtype": "float32",
  "transformers_version": "4.13.0.dev0",
  "use_weighted_layer_sum": false,
  "vocab_size": 32,
  "tokenizer_class": "Wav2Vec2CTCTokenizer"
}
slm/wavlm-base-plus/preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
{
  "do_normalize": false,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}
slm/wavlm-base-plus/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3bb273a6ace99408b50cfc81afdbb7ef2de02da2eab0234e18db608ce692fe51
size 377617425
text/__init__.py
CHANGED
@@ -18,13 +18,15 @@ def cleaned_text_to_sequence(cleaned_text, tones, language):
     return phones, tones, lang_ids


-def get_bert(norm_text, word2ph, language, device):
+def get_bert(norm_text, word2ph, language, device, style_text=None, style_weight=0.7):
     from .chinese_bert import get_bert_feature as zh_bert
     from .english_bert_mock import get_bert_feature as en_bert
     from .japanese_bert import get_bert_feature as jp_bert

     lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
-    bert = lang_bert_func_map[language](
+    bert = lang_bert_func_map[language](
+        norm_text, word2ph, device, style_text, style_weight
+    )
     return bert

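With the extra arguments threaded through the dispatcher, style blending can be requested from any of the three language back ends through one call. A hedged usage sketch; it assumes the repo's text package, its dictionaries and the matching BERT checkpoints are available locally, and that the returned tensor is laid out as (1024, n_phones) as implied by the placeholders in get_text:

```python
# Hedged usage sketch of the updated get_bert dispatcher (requires the repo's
# text package plus the local BERT checkpoints it loads).
from text import get_bert
from text.cleaner import clean_text

norm_text, phones, tones, word2ph = clean_text("为什么要演奏春日影!", "ZH")
bert = get_bert(
    norm_text, word2ph, "ZH", device="cpu",
    style_text="开心", style_weight=0.7,   # omit style_text for the old behaviour
)
print(bert.shape)  # expected: (1024, sum(word2ph)) phone-level features
```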
text/__pycache__/__init__.cpython-311.pyc
CHANGED
Binary files a/text/__pycache__/__init__.cpython-311.pyc and b/text/__pycache__/__init__.cpython-311.pyc differ
text/__pycache__/bert_utils.cpython-311.pyc
CHANGED
Binary files a/text/__pycache__/bert_utils.cpython-311.pyc and b/text/__pycache__/bert_utils.cpython-311.pyc differ
text/__pycache__/chinese.cpython-311.pyc
CHANGED
Binary files a/text/__pycache__/chinese.cpython-311.pyc and b/text/__pycache__/chinese.cpython-311.pyc differ
text/__pycache__/chinese_bert.cpython-311.pyc
CHANGED
Binary files a/text/__pycache__/chinese_bert.cpython-311.pyc and b/text/__pycache__/chinese_bert.cpython-311.pyc differ
text/__pycache__/cleaner.cpython-311.pyc
CHANGED
Binary files a/text/__pycache__/cleaner.cpython-311.pyc and b/text/__pycache__/cleaner.cpython-311.pyc differ
text/__pycache__/english.cpython-311.pyc
CHANGED
Binary files a/text/__pycache__/english.cpython-311.pyc and b/text/__pycache__/english.cpython-311.pyc differ
text/__pycache__/english_bert_mock.cpython-311.pyc
CHANGED
Binary files a/text/__pycache__/english_bert_mock.cpython-311.pyc and b/text/__pycache__/english_bert_mock.cpython-311.pyc differ
text/__pycache__/japanese.cpython-311.pyc
CHANGED
Binary files a/text/__pycache__/japanese.cpython-311.pyc and b/text/__pycache__/japanese.cpython-311.pyc differ
text/__pycache__/japanese_bert.cpython-311.pyc
CHANGED
Binary files a/text/__pycache__/japanese_bert.cpython-311.pyc and b/text/__pycache__/japanese_bert.cpython-311.pyc differ
text/__pycache__/symbols.cpython-311.pyc
CHANGED
Binary files a/text/__pycache__/symbols.cpython-311.pyc and b/text/__pycache__/symbols.cpython-311.pyc differ
text/__pycache__/tone_sandhi.cpython-311.pyc
CHANGED
Binary files a/text/__pycache__/tone_sandhi.cpython-311.pyc and b/text/__pycache__/tone_sandhi.cpython-311.pyc differ
text/chinese_bert.py
CHANGED
@@ -12,7 +12,13 @@ tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
 models = dict()


-def get_bert_feature(
+def get_bert_feature(
+    text,
+    word2ph,
+    device=config.bert_gen_config.device,
+    style_text=None,
+    style_weight=0.7,
+):
     if (
         sys.platform == "darwin"
         and torch.backends.mps.is_available()
@@ -29,12 +35,24 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
         inputs[i] = inputs[i].to(device)
     res = models[device](**inputs, output_hidden_states=True)
     res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
-
+    if style_text:
+        style_inputs = tokenizer(style_text, return_tensors="pt")
+        for i in style_inputs:
+            style_inputs[i] = style_inputs[i].to(device)
+        style_res = models[device](**style_inputs, output_hidden_states=True)
+        style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
+        style_res_mean = style_res.mean(0)
     assert len(word2ph) == len(text) + 2
     word2phone = word2ph
     phone_level_feature = []
     for i in range(len(word2phone)):
-
+        if style_text:
+            repeat_feature = (
+                res[i].repeat(word2phone[i], 1) * (1 - style_weight)
+                + style_res_mean.repeat(word2phone[i], 1) * style_weight
+            )
+        else:
+            repeat_feature = res[i].repeat(word2phone[i], 1)
         phone_level_feature.append(repeat_feature)

     phone_level_feature = torch.cat(phone_level_feature, dim=0)
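The added branch blends, phone by phone, the main text's token feature with the mean-pooled feature of the auxiliary style text. A self-contained numeric sketch of just that step; the shapes are illustrative, the real features come from the BERT hidden states above:

```python
# Standalone sketch of the style-mixing step: a convex blend of each token's
# feature with the mean style feature, repeated once per phone of that token.
import torch

style_weight = 0.7
word2phone = [2, 1, 3]                    # phones per input token (toy values)
res = torch.randn(len(word2phone), 1024)  # per-token features of the main text
style_res_mean = torch.randn(1024)        # mean-pooled features of the style text

phone_level = []
for i, n in enumerate(word2phone):
    mixed = (res[i].repeat(n, 1) * (1 - style_weight)
             + style_res_mean.repeat(n, 1) * style_weight)
    phone_level.append(mixed)

phone_level_feature = torch.cat(phone_level, dim=0)  # (sum(word2phone), 1024)
print(phone_level_feature.shape)                     # torch.Size([6, 1024])
```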
text/cleaner.py
CHANGED
@@ -1,7 +1,7 @@
-from text import chinese, japanese, cleaned_text_to_sequence
+from text import chinese, japanese, english, cleaned_text_to_sequence


-language_module_map = {"ZH": chinese, "JP": japanese}
+language_module_map = {"ZH": chinese, "JP": japanese, "EN": english}


 def clean_text(text, language):
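With english registered in language_module_map, the same clean_text entry point now covers all three languages. A hedged usage sketch; it requires the repo's text package together with its G2P and tokenizer resources:

```python
# Hedged usage sketch of clean_text for the newly wired "EN" path.
from text.cleaner import clean_text

norm_text, phones, tones, word2ph = clean_text("How are you today?", "EN")
print(phones[:8], tones[:8], word2ph)
```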
text/english.py
CHANGED
@@ -5,6 +5,7 @@ from g2p_en import G2p
 from transformers import DebertaV2Tokenizer

 from text import symbols
+from text.symbols import punctuation

 current_file_path = os.path.dirname(__file__)
 CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
@@ -217,6 +218,8 @@ def refine_ph(phn):
     if re.search(r"\d$", phn):
         tone = int(phn[-1]) + 1
         phn = phn[:-1]
+    else:
+        tone = 3
     return phn.lower(), tone


@@ -389,45 +392,84 @@ def sep_text(text):
     return words


+def text_to_words(text):
+    tokens = tokenizer.tokenize(text)
+    words = []
+    for idx, t in enumerate(tokens):
+        if t.startswith("▁"):
+            words.append([t[1:]])
+        else:
+            if t in punctuation:
+                if idx == len(tokens) - 1:
+                    words.append([f"{t}"])
+                else:
+                    if (
+                        not tokens[idx + 1].startswith("▁")
+                        and tokens[idx + 1] not in punctuation
+                    ):
+                        if idx == 0:
+                            words.append([])
+                        words[-1].append(f"{t}")
+                    else:
+                        words.append([f"{t}"])
+            else:
+                if idx == 0:
+                    words.append([])
+                words[-1].append(f"{t}")
+    return words
+
+
 def g2p(text):
     phones = []
     tones = []
-
-    words = sep_text(text)
-    tokens = [tokenizer.tokenize(i) for i in words]
+    phone_len = []
+    # words = sep_text(text)
+    # tokens = [tokenizer.tokenize(i) for i in words]
+    words = text_to_words(text)
+
     for word in words:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        temp_phones, temp_tones = [], []
+        if len(word) > 1:
+            if "'" in word:
+                word = ["".join(word)]
+        for w in word:
+            if w in punctuation:
+                temp_phones.append(w)
+                temp_tones.append(0)
+                continue
+            if w.upper() in eng_dict:
+                phns, tns = refine_syllables(eng_dict[w.upper()])
+                temp_phones += [post_replace_ph(i) for i in phns]
+                temp_tones += tns
+                # w2ph.append(len(phns))
+            else:
+                phone_list = list(filter(lambda p: p != " ", _g2p(w)))
+                phns = []
+                tns = []
+                for ph in phone_list:
+                    if ph in arpa:
+                        ph, tn = refine_ph(ph)
+                        phns.append(ph)
+                        tns.append(tn)
+                    else:
+                        phns.append(ph)
+                        tns.append(0)
+                temp_phones += [post_replace_ph(i) for i in phns]
+                temp_tones += tns
+        phones += temp_phones
+        tones += temp_tones
+        phone_len.append(len(temp_phones))
+    # phones = [post_replace_ph(i) for i in phones]

     word2ph = []
-    for token,
-        phone_len = len(phoneme)
+    for token, pl in zip(words, phone_len):
         word_len = len(token)

-        aaa = distribute_phone(
+        aaa = distribute_phone(pl, word_len)
         word2ph += aaa

-    phones = ["_"] +
-    tones = [0] +
+    phones = ["_"] + phones + ["_"]
+    tones = [0] + tones + [0]
     word2ph = [1] + word2ph + [1]
     assert len(phones) == len(tones), text
     assert len(phones) == sum(word2ph), text
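The rewritten g2p records one phone count per merged token (phone_len) and then spreads those phones over the token's sub-pieces via distribute_phone, which is defined outside this hunk. The sketch below shows the even-split behaviour that call is assumed to have:

```python
# Sketch of the assumed behaviour of distribute_phone(n_phone, n_word):
# hand each successive phone to the word slot that currently has the fewest.
def distribute_phone(n_phone: int, n_word: int) -> list[int]:
    phones_per_word = [0] * n_word
    for _ in range(n_phone):
        min_index = phones_per_word.index(min(phones_per_word))
        phones_per_word[min_index] += 1
    return phones_per_word

print(distribute_phone(5, 2))  # [3, 2]
```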
text/english_bert_mock.py
CHANGED
@@ -13,7 +13,13 @@ tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
 models = dict()


-def get_bert_feature(
+def get_bert_feature(
+    text,
+    word2ph,
+    device=config.bert_gen_config.device,
+    style_text=None,
+    style_weight=0.7,
+):
     if (
         sys.platform == "darwin"
         and torch.backends.mps.is_available()
@@ -30,11 +36,24 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
         inputs[i] = inputs[i].to(device)
     res = models[device](**inputs, output_hidden_states=True)
     res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
+    if style_text:
+        style_inputs = tokenizer(style_text, return_tensors="pt")
+        for i in style_inputs:
+            style_inputs[i] = style_inputs[i].to(device)
+        style_res = models[device](**style_inputs, output_hidden_states=True)
+        style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
+        style_res_mean = style_res.mean(0)
     assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
     word2phone = word2ph
     phone_level_feature = []
     for i in range(len(word2phone)):
-
+        if style_text:
+            repeat_feature = (
+                res[i].repeat(word2phone[i], 1) * (1 - style_weight)
+                + style_res_mean.repeat(word2phone[i], 1) * style_weight
+            )
+        else:
+            repeat_feature = res[i].repeat(word2phone[i], 1)
         phone_level_feature.append(repeat_feature)

     phone_level_feature = torch.cat(phone_level_feature, dim=0)
text/japanese_bert.py
CHANGED
@@ -13,8 +13,16 @@ tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
 models = dict()


-def get_bert_feature(
+def get_bert_feature(
+    text,
+    word2ph,
+    device=config.bert_gen_config.device,
+    style_text=None,
+    style_weight=0.7,
+):
     text = "".join(text2sep_kata(text)[0])
+    if style_text:
+        style_text = "".join(text2sep_kata(style_text)[0])
     if (
         sys.platform == "darwin"
         and torch.backends.mps.is_available()
@@ -31,12 +39,25 @@ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
         inputs[i] = inputs[i].to(device)
     res = models[device](**inputs, output_hidden_states=True)
     res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
+    if style_text:
+        style_inputs = tokenizer(style_text, return_tensors="pt")
+        for i in style_inputs:
+            style_inputs[i] = style_inputs[i].to(device)
+        style_res = models[device](**style_inputs, output_hidden_states=True)
+        style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
+        style_res_mean = style_res.mean(0)

     assert len(word2ph) == len(text) + 2
     word2phone = word2ph
     phone_level_feature = []
     for i in range(len(word2phone)):
-
+        if style_text:
+            repeat_feature = (
+                res[i].repeat(word2phone[i], 1) * (1 - style_weight)
+                + style_res_mean.repeat(word2phone[i], 1) * style_weight
+            )
+        else:
+            repeat_feature = res[i].repeat(word2phone[i], 1)
         phone_level_feature.append(repeat_feature)

     phone_level_feature = torch.cat(phone_level_feature, dim=0)
text/tone_sandhi.py
CHANGED
@@ -634,9 +634,11 @@ class ToneSandhi:
     # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
     # output seg: [['听一听', 'v']]
     def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
-        new_seg = []
+        new_seg = [] * len(seg)
         # function 1
-
+        i = 0
+        while i < len(seg):
+            word, pos = seg[i]
             if (
                 i - 1 >= 0
                 and word == "一"
@@ -645,6 +647,7 @@ class ToneSandhi:
                 and seg[i - 1][1] == "v"
             ):
                 new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
+                i += 2
             else:
                 if (
                     i - 2 >= 0
@@ -655,7 +658,8 @@ class ToneSandhi:
                     continue
                 else:
                     new_seg.append([word, pos])
-
+                i += 1
+        seg = [i for i in new_seg if len(i) > 0]
         new_seg = []
         # function 2
         for i, (word, pos) in enumerate(seg):
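The rewritten _merge_yi walks the segment list with an explicit index so that a matched 「verb + 一 + verb」 triple can be collapsed and skipped in a single step. A standalone toy sketch of just that merge; the full method also handles a second "一" pattern and filters empty entries, which is omitted here, and the extra condition on the following token is assumed from the elided context lines:

```python
# Toy sketch of the "听一听" merge performed by the while-loop above.
def merge_yi(seg):
    new_seg, i = [], 0
    while i < len(seg):
        word, pos = seg[i]
        if (i >= 1 and word == "一" and i + 1 < len(seg)
                and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v"):
            new_seg[-1][0] = new_seg[-1][0] + "一" + new_seg[-1][0]
            i += 2          # consume "一" and the repeated verb in one step
        else:
            new_seg.append([word, pos])
            i += 1
    return new_seg

print(merge_yi([("听", "v"), ("一", "m"), ("听", "v")]))  # [['听一听', 'v']]
```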
tools/__pycache__/__init__.cpython-311.pyc
CHANGED
Binary files a/tools/__pycache__/__init__.cpython-311.pyc and b/tools/__pycache__/__init__.cpython-311.pyc differ
tools/__pycache__/classify_language.cpython-311.pyc
CHANGED
Binary files a/tools/__pycache__/classify_language.cpython-311.pyc and b/tools/__pycache__/classify_language.cpython-311.pyc differ
tools/__pycache__/log.cpython-311.pyc
ADDED
Binary file (547 Bytes).
tools/__pycache__/sentence.cpython-311.pyc
CHANGED
Binary files a/tools/__pycache__/sentence.cpython-311.pyc and b/tools/__pycache__/sentence.cpython-311.pyc differ
tools/__pycache__/translate.cpython-311.pyc
CHANGED
Binary files a/tools/__pycache__/translate.cpython-311.pyc and b/tools/__pycache__/translate.cpython-311.pyc differ
train_ms.py
CHANGED
@@ -27,8 +27,15 @@ from models import (
     SynthesizerTrn,
     MultiPeriodDiscriminator,
     DurationDiscriminator,
 )
-from losses import generator_loss, discriminator_loss, feature_loss, kl_loss
 from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
 from text.symbols import symbols

@@ -42,7 +49,6 @@ torch.backends.cuda.enable_flash_sdp(True)
 torch.backends.cuda.enable_mem_efficient_sdp(
     True
 )  # Not available if torch version is lower than 2.0
-torch.backends.cuda.enable_math_sdp(True)
 global_step = 0


@@ -173,6 +179,8 @@ def run():
         0.1,
         gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0,
     ).cuda(local_rank)
     if (
         "use_spk_conditioned_encoder" in hps.model.keys()
         and hps.model.use_spk_conditioned_encoder is True
@@ -210,6 +218,9 @@ def run():
             param.requires_grad = False

     net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(local_rank)
     optim_g = torch.optim.AdamW(
         filter(lambda p: p.requires_grad, net_g.parameters()),
         hps.train.learning_rate,
@@ -222,6 +233,12 @@ def run():
         betas=hps.train.betas,
         eps=hps.train.eps,
     )
     if net_dur_disc is not None:
         optim_dur_disc = torch.optim.AdamW(
             net_dur_disc.parameters(),
@@ -233,12 +250,11 @@ def run():
         optim_dur_disc = None
     net_g = DDP(net_g, device_ids=[local_rank], bucket_cap_mb=512)
     net_d = DDP(net_d, device_ids=[local_rank], bucket_cap_mb=512)
-
     if net_dur_disc is not None:
         net_dur_disc = DDP(
             net_dur_disc,
             device_ids=[local_rank],
-            find_unused_parameters=True,
             bucket_cap_mb=512,
         )

@@ -250,9 +266,10 @@ def run():
             token=config.openi_token,
             mirror=config.mirror,
         )
-
-
-
         _, _, dur_resume_lr, epoch_str = utils.load_checkpoint(
             utils.latest_checkpoint_path(hps.model_dir, "DUR_*.pth"),
             net_dur_disc,
@@ -261,28 +278,32 @@ def run():
             if "skip_optimizer" in hps.train
             else True,
         )
-        _, optim_g, g_resume_lr, epoch_str = utils.load_checkpoint(
-            utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"),
-            net_g,
-            optim_g,
-            skip_optimizer=hps.train.skip_optimizer
-            if "skip_optimizer" in hps.train
-            else True,
-        )
-        _, optim_d, d_resume_lr, epoch_str = utils.load_checkpoint(
-            utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"),
-            net_d,
-            optim_d,
-            skip_optimizer=hps.train.skip_optimizer
-            if "skip_optimizer" in hps.train
-            else True,
-        )
-        if not optim_g.param_groups[0].get("initial_lr"):
-            optim_g.param_groups[0]["initial_lr"] = g_resume_lr
-        if not optim_d.param_groups[0].get("initial_lr"):
-            optim_d.param_groups[0]["initial_lr"] = d_resume_lr
         if not optim_dur_disc.param_groups[0].get("initial_lr"):
             optim_dur_disc.param_groups[0]["initial_lr"] = dur_resume_lr

         epoch_str = max(epoch_str, 1)
         # global_step = (epoch_str - 1) * len(train_loader)
@@ -297,21 +318,43 @@ def run():
         epoch_str = 1
         global_step = 0

     scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
         optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
     )
     scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
         optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
     )
     if net_dur_disc is not None:
-        if not optim_dur_disc.param_groups[0].get("initial_lr"):
-            optim_dur_disc.param_groups[0]["initial_lr"] = dur_resume_lr
         scheduler_dur_disc = torch.optim.lr_scheduler.ExponentialLR(
             optim_dur_disc, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
         )
     else:
         scheduler_dur_disc = None
-    scaler = GradScaler(enabled=hps.train.

     for epoch in range(epoch_str, hps.train.epochs + 1):
         if rank == 0:
@@ -320,9 +363,9 @@ def run():
                 local_rank,
                 epoch,
                 hps,
-                [net_g, net_d, net_dur_disc],
-                [optim_g, optim_d, optim_dur_disc],
-                [scheduler_g, scheduler_d, scheduler_dur_disc],
                 scaler,
                 [train_loader, eval_loader],
                 logger,
@@ -334,9 +377,9 @@ def run():
                 local_rank,
                 epoch,
                 hps,
-                [net_g, net_d, net_dur_disc],
-                [optim_g, optim_d, optim_dur_disc],
-                [scheduler_g, scheduler_d, scheduler_dur_disc],
                 scaler,
                 [train_loader, None],
                 None,
@@ -344,6 +387,7 @@ def run():
             )
         scheduler_g.step()
         scheduler_d.step()
         if net_dur_disc is not None:
             scheduler_dur_disc.step()

@@ -361,9 +405,9 @@ def train_and_evaluate(
     logger,
     writers,
 ):
-    net_g, net_d, net_dur_disc = nets
-    optim_g, optim_d, optim_dur_disc = optims
-    scheduler_g, scheduler_d, scheduler_dur_disc = schedulers
     train_loader, eval_loader = loaders
     if writers is not None:
         writer, writer_eval = writers
@@ -373,6 +417,7 @@ def train_and_evaluate(

     net_g.train()
     net_d.train()
     if net_dur_disc is not None:
         net_dur_disc.train()
     for batch_idx, (
@@ -388,7 +433,6 @@ def train_and_evaluate(
         bert,
         ja_bert,
         en_bert,
-        emo,
     ) in enumerate(tqdm(train_loader)):
         if net_g.module.use_noise_scaled_mas:
             current_mas_noise_scale = (
@@ -411,9 +455,8 @@ def train_and_evaluate(
         bert = bert.cuda(local_rank, non_blocking=True)
         ja_bert = ja_bert.cuda(local_rank, non_blocking=True)
         en_bert = en_bert.cuda(local_rank, non_blocking=True)
-        emo = emo.cuda(local_rank, non_blocking=True)

-        with autocast(enabled=hps.train.
             (
                 y_hat,
                 l_length,
@@ -422,9 +465,8 @@ def train_and_evaluate(
                 x_mask,
                 z_mask,
                 (z, z_p, m_p, logs_p, m_q, logs_q),
-                (hidden_x, logw, logw_),
                 g,
-                loss_commit,
             ) = net_g(
                 x,
                 x_lengths,
@@ -436,7 +478,6 @@ def train_and_evaluate(
                 bert,
                 ja_bert,
                 en_bert,
-                emo,
             )
             mel = spec_to_mel_torch(
                 spec,
@@ -450,7 +491,7 @@ def train_and_evaluate(
                 mel, ids_slice, hps.train.segment_size // hps.data.hop_length
             )
             y_hat_mel = mel_spectrogram_torch(
-                y_hat.squeeze(1),
                 hps.data.filter_length,
                 hps.data.n_mel_channels,
                 hps.data.sampling_rate,
@@ -466,7 +507,7 @@ def train_and_evaluate(

             # Discriminator
             y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
-            with autocast(enabled=
                 loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
                     y_d_hat_r, y_d_hat_g
                 )
@@ -475,11 +516,20 @@ def train_and_evaluate(
                 y_dur_hat_r, y_dur_hat_g = net_dur_disc(
                     hidden_x.detach(),
                     x_mask.detach(),
                     logw.detach(),
                     logw_.detach(),
                     g.detach(),
                 )
-
                 # TODO: I think need to mean using the mask, but for now, just mean all
                 (
                     loss_dur_disc,
@@ -490,31 +540,60 @@ def train_and_evaluate(
                 optim_dur_disc.zero_grad()
                 scaler.scale(loss_dur_disc_all).backward()
                 scaler.unscale_(optim_dur_disc)
-
                 scaler.step(optim_dur_disc)

             optim_d.zero_grad()
             scaler.scale(loss_disc_all).backward()
             scaler.unscale_(optim_d)
             grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
             scaler.step(optim_d)

-        with autocast(enabled=hps.train.
             # Generator
             y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
             if net_dur_disc is not None:
-
-
-
-            with autocast(enabled=
                 loss_dur = torch.sum(l_length.float())
                 loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
                 loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl

                 loss_fm = feature_loss(fmap_r, fmap_g)
                 loss_gen, losses_gen = generator_loss(y_d_hat_g)
                 loss_gen_all = (
-                    loss_gen
                 )
                 if net_dur_disc is not None:
                     loss_dur_gen, losses_dur_gen = generator_loss(y_dur_hat_g)
@@ -522,6 +601,8 @@ def train_and_evaluate(
             optim_g.zero_grad()
             scaler.scale(loss_gen_all).backward()
             scaler.unscale_(optim_g)
             grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
             scaler.step(optim_g)
             scaler.update()
@@ -540,9 +621,12 @@ def train_and_evaluate(
                 scalar_dict = {
                     "loss/g/total": loss_gen_all,
                     "loss/d/total": loss_disc_all,
                     "learning_rate": lr,
                     "grad_norm_d": grad_norm_d,
                     "grad_norm_g": grad_norm_g,
                 }
                 scalar_dict.update(
                     {
@@ -550,6 +634,8 @@ def train_and_evaluate(
                         "loss/g/mel": loss_mel,
                         "loss/g/dur": loss_dur,
                         "loss/g/kl": loss_kl,
                     }
                 )
                 scalar_dict.update(
@@ -562,6 +648,30 @@ def train_and_evaluate(
                     {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
                 )

                 image_dict = {
                     "slice/mel_org": utils.plot_spectrogram_to_numpy(
                         y_mel[0].data.cpu().numpy()
@@ -599,6 +709,13 @@ def train_and_evaluate(
                     epoch,
                     os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
                 )
|
|
|
|
|
|
|
|
|
|
|
|
602 |
if net_dur_disc is not None:
|
603 |
utils.save_checkpoint(
|
604 |
net_dur_disc,
|
@@ -642,7 +759,6 @@ def evaluate(hps, generator, eval_loader, writer_eval):
|
|
642 |
bert,
|
643 |
ja_bert,
|
644 |
en_bert,
|
645 |
-
emo,
|
646 |
) in enumerate(eval_loader):
|
647 |
x, x_lengths = x.cuda(), x_lengths.cuda()
|
648 |
spec, spec_lengths = spec.cuda(), spec_lengths.cuda()
|
@@ -653,7 +769,6 @@ def evaluate(hps, generator, eval_loader, writer_eval):
|
|
653 |
en_bert = en_bert.cuda()
|
654 |
tone = tone.cuda()
|
655 |
language = language.cuda()
|
656 |
-
emo = emo.cuda()
|
657 |
for use_sdp in [True, False]:
|
658 |
y_hat, attn, mask, *_ = generator.module.infer(
|
659 |
x,
|
@@ -664,7 +779,6 @@ def evaluate(hps, generator, eval_loader, writer_eval):
|
|
664 |
bert,
|
665 |
ja_bert,
|
666 |
en_bert,
|
667 |
-
emo,
|
668 |
y=spec,
|
669 |
max_len=1000,
|
670 |
sdp_ratio=0.0 if not use_sdp else 1.0,
|
|
|
27 |
SynthesizerTrn,
|
28 |
MultiPeriodDiscriminator,
|
29 |
DurationDiscriminator,
|
30 |
+
WavLMDiscriminator,
|
31 |
+
)
|
32 |
+
from losses import (
|
33 |
+
generator_loss,
|
34 |
+
discriminator_loss,
|
35 |
+
feature_loss,
|
36 |
+
kl_loss,
|
37 |
+
WavLMLoss,
|
38 |
)
|
|
|
39 |
from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
|
40 |
from text.symbols import symbols
|
41 |
|
|
|
49 |
torch.backends.cuda.enable_mem_efficient_sdp(
|
50 |
True
|
51 |
) # Not available if torch version is lower than 2.0
|
|
|
52 |
global_step = 0
|
53 |
|
54 |
|
|
|
179 |
0.1,
|
180 |
gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0,
|
181 |
).cuda(local_rank)
|
182 |
+
else:
|
183 |
+
net_dur_disc = None
|
184 |
if (
|
185 |
"use_spk_conditioned_encoder" in hps.model.keys()
|
186 |
and hps.model.use_spk_conditioned_encoder is True
|
|
|
218 |
param.requires_grad = False
|
219 |
|
220 |
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(local_rank)
|
221 |
+
net_wd = WavLMDiscriminator(
|
222 |
+
hps.model.slm.hidden, hps.model.slm.nlayers, hps.model.slm.initial_channel
|
223 |
+
).cuda(local_rank)
|
224 |
optim_g = torch.optim.AdamW(
|
225 |
filter(lambda p: p.requires_grad, net_g.parameters()),
|
226 |
hps.train.learning_rate,
|
|
|
233 |
betas=hps.train.betas,
|
234 |
eps=hps.train.eps,
|
235 |
)
|
236 |
+
optim_wd = torch.optim.AdamW(
|
237 |
+
net_wd.parameters(),
|
238 |
+
hps.train.learning_rate,
|
239 |
+
betas=hps.train.betas,
|
240 |
+
eps=hps.train.eps,
|
241 |
+
)
|
242 |
if net_dur_disc is not None:
|
243 |
optim_dur_disc = torch.optim.AdamW(
|
244 |
net_dur_disc.parameters(),
|
|
|
250 |
optim_dur_disc = None
|
251 |
net_g = DDP(net_g, device_ids=[local_rank], bucket_cap_mb=512)
|
252 |
net_d = DDP(net_d, device_ids=[local_rank], bucket_cap_mb=512)
|
253 |
+
net_wd = DDP(net_wd, device_ids=[local_rank], bucket_cap_mb=512)
|
254 |
if net_dur_disc is not None:
|
255 |
net_dur_disc = DDP(
|
256 |
net_dur_disc,
|
257 |
device_ids=[local_rank],
|
|
|
258 |
bucket_cap_mb=512,
|
259 |
)
|
260 |
|
|
|
266 |
token=config.openi_token,
|
267 |
mirror=config.mirror,
|
268 |
)
|
269 |
+
dur_resume_lr = hps.train.learning_rate
|
270 |
+
wd_resume_lr = hps.train.learning_rate
|
271 |
+
if net_dur_disc is not None:
|
272 |
+
try:
|
273 |
_, _, dur_resume_lr, epoch_str = utils.load_checkpoint(
|
274 |
utils.latest_checkpoint_path(hps.model_dir, "DUR_*.pth"),
|
275 |
net_dur_disc,
|
|
|
278 |
if "skip_optimizer" in hps.train
|
279 |
else True,
|
280 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
281 |
if not optim_dur_disc.param_groups[0].get("initial_lr"):
|
282 |
optim_dur_disc.param_groups[0]["initial_lr"] = dur_resume_lr
|
283 |
+
except:
|
284 |
+
print("Initialize dur_disc")
|
285 |
+
|
286 |
+
try:
|
287 |
+
_, optim_g, g_resume_lr, epoch_str = utils.load_checkpoint(
|
288 |
+
utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"),
|
289 |
+
net_g,
|
290 |
+
optim_g,
|
291 |
+
skip_optimizer=hps.train.skip_optimizer
|
292 |
+
if "skip_optimizer" in hps.train
|
293 |
+
else True,
|
294 |
+
)
|
295 |
+
_, optim_d, d_resume_lr, epoch_str = utils.load_checkpoint(
|
296 |
+
utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"),
|
297 |
+
net_d,
|
298 |
+
optim_d,
|
299 |
+
skip_optimizer=hps.train.skip_optimizer
|
300 |
+
if "skip_optimizer" in hps.train
|
301 |
+
else True,
|
302 |
+
)
|
303 |
+
if not optim_g.param_groups[0].get("initial_lr"):
|
304 |
+
optim_g.param_groups[0]["initial_lr"] = g_resume_lr
|
305 |
+
if not optim_d.param_groups[0].get("initial_lr"):
|
306 |
+
optim_d.param_groups[0]["initial_lr"] = d_resume_lr
|
307 |
|
308 |
epoch_str = max(epoch_str, 1)
|
309 |
# global_step = (epoch_str - 1) * len(train_loader)
|
|
|
318 |
epoch_str = 1
|
319 |
global_step = 0
|
320 |
|
321 |
+
try:
|
322 |
+
_, optim_wd, wd_resume_lr, epoch_str = utils.load_checkpoint(
|
323 |
+
utils.latest_checkpoint_path(hps.model_dir, "WD_*.pth"),
|
324 |
+
net_wd,
|
325 |
+
optim_wd,
|
326 |
+
skip_optimizer=hps.train.skip_optimizer
|
327 |
+
if "skip_optimizer" in hps.train
|
328 |
+
else True,
|
329 |
+
)
|
330 |
+
if not optim_wd.param_groups[0].get("initial_lr"):
|
331 |
+
optim_wd.param_groups[0]["initial_lr"] = wd_resume_lr
|
332 |
+
except Exception as e:
|
333 |
+
print(e)
|
334 |
+
|
335 |
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
|
336 |
optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
|
337 |
)
|
338 |
scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
|
339 |
optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
|
340 |
)
|
341 |
+
scheduler_wd = torch.optim.lr_scheduler.ExponentialLR(
|
342 |
+
optim_wd, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
|
343 |
+
)
|
344 |
if net_dur_disc is not None:
|
|
|
|
|
345 |
scheduler_dur_disc = torch.optim.lr_scheduler.ExponentialLR(
|
346 |
optim_dur_disc, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
|
347 |
)
|
348 |
else:
|
349 |
scheduler_dur_disc = None
|
350 |
+
scaler = GradScaler(enabled=hps.train.bf16_run)
|
351 |
+
|
352 |
+
wl = WavLMLoss(
|
353 |
+
hps.model.slm.model,
|
354 |
+
net_wd,
|
355 |
+
hps.data.sampling_rate,
|
356 |
+
hps.model.slm.sr,
|
357 |
+
).to(local_rank)
|
358 |
|
359 |
for epoch in range(epoch_str, hps.train.epochs + 1):
|
360 |
if rank == 0:
|
|
|
363 |
local_rank,
|
364 |
epoch,
|
365 |
hps,
|
366 |
+
[net_g, net_d, net_dur_disc, net_wd, wl],
|
367 |
+
[optim_g, optim_d, optim_dur_disc, optim_wd],
|
368 |
+
[scheduler_g, scheduler_d, scheduler_dur_disc, scheduler_wd],
|
369 |
scaler,
|
370 |
[train_loader, eval_loader],
|
371 |
logger,
|
|
|
377 |
local_rank,
|
378 |
epoch,
|
379 |
hps,
|
380 |
+
[net_g, net_d, net_dur_disc, net_wd, wl],
|
381 |
+
[optim_g, optim_d, optim_dur_disc, optim_wd],
|
382 |
+
[scheduler_g, scheduler_d, scheduler_dur_disc, scheduler_wd],
|
383 |
scaler,
|
384 |
[train_loader, None],
|
385 |
None,
|
|
|
387 |
)
|
388 |
scheduler_g.step()
|
389 |
scheduler_d.step()
|
390 |
+
scheduler_wd.step()
|
391 |
if net_dur_disc is not None:
|
392 |
scheduler_dur_disc.step()
|
393 |
|
|
|
405 |
logger,
|
406 |
writers,
|
407 |
):
|
408 |
+
net_g, net_d, net_dur_disc, net_wd, wl = nets
|
409 |
+
optim_g, optim_d, optim_dur_disc, optim_wd = optims
|
410 |
+
scheduler_g, scheduler_d, scheduler_dur_disc, scheduler_wd = schedulers
|
411 |
train_loader, eval_loader = loaders
|
412 |
if writers is not None:
|
413 |
writer, writer_eval = writers
|
|
|
417 |
|
418 |
net_g.train()
|
419 |
net_d.train()
|
420 |
+
net_wd.train()
|
421 |
if net_dur_disc is not None:
|
422 |
net_dur_disc.train()
|
423 |
for batch_idx, (
|
|
|
433 |
bert,
|
434 |
ja_bert,
|
435 |
en_bert,
|
|
|
436 |
) in enumerate(tqdm(train_loader)):
|
437 |
if net_g.module.use_noise_scaled_mas:
|
438 |
current_mas_noise_scale = (
|
|
|
455 |
bert = bert.cuda(local_rank, non_blocking=True)
|
456 |
ja_bert = ja_bert.cuda(local_rank, non_blocking=True)
|
457 |
en_bert = en_bert.cuda(local_rank, non_blocking=True)
|
|
|
458 |
|
459 |
+
with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16):
|
460 |
(
|
461 |
y_hat,
|
462 |
l_length,
|
|
|
465 |
x_mask,
|
466 |
z_mask,
|
467 |
(z, z_p, m_p, logs_p, m_q, logs_q),
|
468 |
+
(hidden_x, logw, logw_, logw_sdp),
|
469 |
g,
|
|
|
470 |
) = net_g(
|
471 |
x,
|
472 |
x_lengths,
|
|
|
478 |
bert,
|
479 |
ja_bert,
|
480 |
en_bert,
|
|
|
481 |
)
|
482 |
mel = spec_to_mel_torch(
|
483 |
spec,
|
|
|
491 |
mel, ids_slice, hps.train.segment_size // hps.data.hop_length
|
492 |
)
|
493 |
y_hat_mel = mel_spectrogram_torch(
|
494 |
+
y_hat.squeeze(1).float(),
|
495 |
hps.data.filter_length,
|
496 |
hps.data.n_mel_channels,
|
497 |
hps.data.sampling_rate,
|
|
|
507 |
|
508 |
# Discriminator
|
509 |
y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
|
510 |
+
with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16):
|
511 |
loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
|
512 |
y_d_hat_r, y_d_hat_g
|
513 |
)
|
|
|
516 |
y_dur_hat_r, y_dur_hat_g = net_dur_disc(
|
517 |
hidden_x.detach(),
|
518 |
x_mask.detach(),
|
519 |
+
logw_.detach(),
|
520 |
logw.detach(),
|
521 |
+
g.detach(),
|
522 |
+
)
|
523 |
+
y_dur_hat_r_sdp, y_dur_hat_g_sdp = net_dur_disc(
|
524 |
+
hidden_x.detach(),
|
525 |
+
x_mask.detach(),
|
526 |
logw_.detach(),
|
527 |
+
logw_sdp.detach(),
|
528 |
g.detach(),
|
529 |
)
|
530 |
+
y_dur_hat_r = y_dur_hat_r + y_dur_hat_r_sdp
|
531 |
+
y_dur_hat_g = y_dur_hat_g + y_dur_hat_g_sdp
|
532 |
+
with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16):
|
533 |
# TODO: I think need to mean using the mask, but for now, just mean all
|
534 |
(
|
535 |
loss_dur_disc,
|
|
|
540 |
optim_dur_disc.zero_grad()
|
541 |
scaler.scale(loss_dur_disc_all).backward()
|
542 |
scaler.unscale_(optim_dur_disc)
|
543 |
+
# torch.nn.utils.clip_grad_norm_(
|
544 |
+
# parameters=net_dur_disc.parameters(), max_norm=100
|
545 |
+
# )
|
546 |
+
grad_norm_dur = commons.clip_grad_value_(
|
547 |
+
net_dur_disc.parameters(), None
|
548 |
+
)
|
549 |
scaler.step(optim_dur_disc)
|
550 |
|
551 |
optim_d.zero_grad()
|
552 |
scaler.scale(loss_disc_all).backward()
|
553 |
scaler.unscale_(optim_d)
|
554 |
+
if getattr(hps.train, "bf16_run", False):
|
555 |
+
torch.nn.utils.clip_grad_norm_(parameters=net_d.parameters(), max_norm=200)
|
556 |
grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
|
557 |
scaler.step(optim_d)
|
558 |
|
559 |
+
with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16):
|
560 |
+
loss_slm = wl.discriminator(
|
561 |
+
y.detach().squeeze(), y_hat.detach().squeeze()
|
562 |
+
).mean()
|
563 |
+
|
564 |
+
optim_wd.zero_grad()
|
565 |
+
scaler.scale(loss_slm).backward()
|
566 |
+
scaler.unscale_(optim_wd)
|
567 |
+
# torch.nn.utils.clip_grad_norm_(parameters=net_wd.parameters(), max_norm=200)
|
568 |
+
grad_norm_wd = commons.clip_grad_value_(net_wd.parameters(), None)
|
569 |
+
scaler.step(optim_wd)
|
570 |
+
|
571 |
+
with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16):
|
572 |
# Generator
|
573 |
y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
|
574 |
if net_dur_disc is not None:
|
575 |
+
_, y_dur_hat_g = net_dur_disc(hidden_x, x_mask, logw_, logw, g)
|
576 |
+
_, y_dur_hat_g_sdp = net_dur_disc(hidden_x, x_mask, logw_, logw_sdp, g)
|
577 |
+
y_dur_hat_g = y_dur_hat_g + y_dur_hat_g_sdp
|
578 |
+
with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16):
|
579 |
loss_dur = torch.sum(l_length.float())
|
580 |
loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
|
581 |
loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
|
582 |
|
583 |
loss_fm = feature_loss(fmap_r, fmap_g)
|
584 |
loss_gen, losses_gen = generator_loss(y_d_hat_g)
|
585 |
+
|
586 |
+
loss_lm = wl(y.detach().squeeze(), y_hat.squeeze()).mean()
|
587 |
+
loss_lm_gen = wl.generator(y_hat.squeeze())
|
588 |
+
|
589 |
loss_gen_all = (
|
590 |
+
loss_gen
|
591 |
+
+ loss_fm
|
592 |
+
+ loss_mel
|
593 |
+
+ loss_dur
|
594 |
+
+ loss_kl
|
595 |
+
+ loss_lm
|
596 |
+
+ loss_lm_gen
|
597 |
)
|
598 |
if net_dur_disc is not None:
|
599 |
loss_dur_gen, losses_dur_gen = generator_loss(y_dur_hat_g)
|
|
|
601 |
optim_g.zero_grad()
|
602 |
scaler.scale(loss_gen_all).backward()
|
603 |
scaler.unscale_(optim_g)
|
604 |
+
if getattr(hps.train, "bf16_run", False):
|
605 |
+
torch.nn.utils.clip_grad_norm_(parameters=net_g.parameters(), max_norm=500)
|
606 |
grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
|
607 |
scaler.step(optim_g)
|
608 |
scaler.update()
|
|
|
621 |
scalar_dict = {
|
622 |
"loss/g/total": loss_gen_all,
|
623 |
"loss/d/total": loss_disc_all,
|
624 |
+
"loss/wd/total": loss_slm,
|
625 |
"learning_rate": lr,
|
626 |
"grad_norm_d": grad_norm_d,
|
627 |
"grad_norm_g": grad_norm_g,
|
628 |
+
"grad_norm_dur": grad_norm_dur,
|
629 |
+
"grad_norm_wd": grad_norm_wd,
|
630 |
}
|
631 |
scalar_dict.update(
|
632 |
{
|
|
|
634 |
"loss/g/mel": loss_mel,
|
635 |
"loss/g/dur": loss_dur,
|
636 |
"loss/g/kl": loss_kl,
|
637 |
+
"loss/g/lm": loss_lm,
|
638 |
+
"loss/g/lm_gen": loss_lm_gen,
|
639 |
}
|
640 |
)
|
641 |
scalar_dict.update(
|
|
|
648 |
{"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
|
649 |
)
|
650 |
|
651 |
+
if net_dur_disc is not None:
|
652 |
+
scalar_dict.update({"loss/dur_disc/total": loss_dur_disc_all})
|
653 |
+
|
654 |
+
scalar_dict.update(
|
655 |
+
{
|
656 |
+
"loss/dur_disc_g/{}".format(i): v
|
657 |
+
for i, v in enumerate(losses_dur_disc_g)
|
658 |
+
}
|
659 |
+
)
|
660 |
+
scalar_dict.update(
|
661 |
+
{
|
662 |
+
"loss/dur_disc_r/{}".format(i): v
|
663 |
+
for i, v in enumerate(losses_dur_disc_r)
|
664 |
+
}
|
665 |
+
)
|
666 |
+
|
667 |
+
scalar_dict.update({"loss/g/dur_gen": loss_dur_gen})
|
668 |
+
scalar_dict.update(
|
669 |
+
{
|
670 |
+
"loss/g/dur_gen_{}".format(i): v
|
671 |
+
for i, v in enumerate(losses_dur_gen)
|
672 |
+
}
|
673 |
+
)
|
674 |
+
|
675 |
image_dict = {
|
676 |
"slice/mel_org": utils.plot_spectrogram_to_numpy(
|
677 |
y_mel[0].data.cpu().numpy()
|
|
|
709 |
epoch,
|
710 |
os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
|
711 |
)
|
712 |
+
utils.save_checkpoint(
|
713 |
+
net_wd,
|
714 |
+
optim_wd,
|
715 |
+
hps.train.learning_rate,
|
716 |
+
epoch,
|
717 |
+
os.path.join(hps.model_dir, "WD_{}.pth".format(global_step)),
|
718 |
+
)
|
719 |
if net_dur_disc is not None:
|
720 |
utils.save_checkpoint(
|
721 |
net_dur_disc,
|
|
|
759 |
bert,
|
760 |
ja_bert,
|
761 |
en_bert,
|
|
|
762 |
) in enumerate(eval_loader):
|
763 |
x, x_lengths = x.cuda(), x_lengths.cuda()
|
764 |
spec, spec_lengths = spec.cuda(), spec_lengths.cuda()
|
|
|
769 |
en_bert = en_bert.cuda()
|
770 |
tone = tone.cuda()
|
771 |
language = language.cuda()
|
|
|
772 |
for use_sdp in [True, False]:
|
773 |
y_hat, attn, mask, *_ = generator.module.infer(
|
774 |
x,
|
|
|
779 |
bert,
|
780 |
ja_bert,
|
781 |
en_bert,
|
|
|
782 |
y=spec,
|
783 |
max_len=1000,
|
784 |
sdp_ratio=0.0 if not use_sdp else 1.0,
|
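The changes above add a speech-language-model (SLM) adversarial objective on top of the usual VITS GAN losses: a `WavLMDiscriminator` is trained against WavLM features of real vs. generated audio, and two generator-side terms (`loss_lm`, `loss_lm_gen`) are folded into `loss_gen_all`. A minimal sketch of that round, assuming `WavLMLoss` from the new `losses.py` exposes the `__call__`, `discriminator()` and `generator()` methods exactly as used in the diff; the surrounding mel/KL/duration losses and `scaler.update()` are omitted for brevity:

import torch
from losses import WavLMLoss  # added by this commit (losses.py)

def slm_adversarial_step(wl: WavLMLoss, optim_wd, scaler, y, y_hat):
    """One SLM round: update the WavLM discriminator, then return the
    generator-side terms that get summed into loss_gen_all."""
    # Discriminator side: real vs. generated waveforms, generator detached.
    loss_slm = wl.discriminator(y.detach().squeeze(), y_hat.detach().squeeze()).mean()
    optim_wd.zero_grad()
    scaler.scale(loss_slm).backward()
    scaler.unscale_(optim_wd)
    scaler.step(optim_wd)

    # Generator side: WavLM feature-matching term plus the adversarial term;
    # both back-propagate into the TTS generator.
    loss_lm = wl(y.detach().squeeze(), y_hat.squeeze()).mean()
    loss_lm_gen = wl.generator(y_hat.squeeze())
    return loss_slm, loss_lm, loss_lm_gen

The returned `loss_lm` and `loss_lm_gen` are then added to the generator total, matching the `loss_gen_all` expression in the hunk above.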
utils.py
CHANGED
@@ -301,7 +301,11 @@ def clean_checkpoints(path_to_models="logs/44k/", n_ckpts_to_keep=2, sort_by_time=True):
 
     to_del = [
         os.path.join(path_to_models, fn)
-        for fn in (x_sorted("G")[:-n_ckpts_to_keep] + x_sorted("D")[:-n_ckpts_to_keep])
+        for fn in (
+            x_sorted("G")[:-n_ckpts_to_keep]
+            + x_sorted("D")[:-n_ckpts_to_keep]
+            + x_sorted("WD")[:-n_ckpts_to_keep]
+        )
     ]
 
     def del_info(fn):
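Since training now writes WD_*.pth checkpoints alongside G_*.pth and D_*.pth, the pruning comprehension gains a third prefix. A small self-contained sketch of the same idea, assuming the `<prefix>_<step>.pth` naming used above; the real `clean_checkpoints` also supports sorting by modification time, which is left out here:

import os
import re
from pathlib import Path

def checkpoints_to_delete(path_to_models="logs/44k/", n_ckpts_to_keep=2):
    """Collect stale G_*/D_*/WD_*.pth files, keeping the newest n per prefix."""
    def x_sorted(prefix):
        names = [p.name for p in Path(path_to_models).glob(f"{prefix}_*.pth")]
        # sort by the numeric training step embedded in the file name
        return sorted(names, key=lambda fn: int(re.sub(r"\D", "", fn) or 0))

    return [
        os.path.join(path_to_models, fn)
        for fn in (
            x_sorted("G")[:-n_ckpts_to_keep]
            + x_sorted("D")[:-n_ckpts_to_keep]
            + x_sorted("WD")[:-n_ckpts_to_keep]
        )
    ]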
webui.py
CHANGED
@@ -42,6 +42,8 @@ def generate_audio(
     language,
     reference_audio,
     emotion,
+    style_text,
+    style_weight,
     skip_start=False,
     skip_end=False,
 ):
@@ -49,8 +51,8 @@ def generate_audio(
     # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
     with torch.no_grad():
         for idx, piece in enumerate(slices):
-            skip_start = (idx != 0) and skip_start
-            skip_end = (idx != len(slices) - 1) and skip_end
+            skip_start = idx != 0
+            skip_end = idx != len(slices) - 1
             audio = infer(
                 piece,
                 reference_audio=reference_audio,
@@ -66,10 +68,11 @@ def generate_audio(
                 device=device,
                 skip_start=skip_start,
                 skip_end=skip_end,
+                style_text=style_text,
+                style_weight=style_weight,
             )
             audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
             audio_list.append(audio16bit)
-            # audio_list.append(silence)  # 将静音添加到列表中
     return audio_list
 
 
@@ -90,8 +93,8 @@ def generate_audio_multilang(
     # silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
     with torch.no_grad():
         for idx, piece in enumerate(slices):
-            skip_start = (idx != 0) and skip_start
-            skip_end = (idx != len(slices) - 1) and skip_end
+            skip_start = idx != 0
+            skip_end = idx != len(slices) - 1
             audio = infer_multilang(
                 piece,
                 reference_audio=reference_audio,
@@ -110,7 +113,6 @@ def generate_audio_multilang(
             )
             audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
             audio_list.append(audio16bit)
-            # audio_list.append(silence)  # 将静音添加到列表中
     return audio_list
 
 
@@ -127,63 +129,50 @@ def tts_split(
     interval_between_sent,
     reference_audio,
     emotion,
+    style_text,
+    style_weight,
 ):
-    if language == "mix":
-        return ("invalid", None)
     while text.find("\n\n") != -1:
         text = text.replace("\n\n", "\n")
+    text = text.replace("|", "")
     para_list = re_matching.cut_para(text)
+    para_list = [p for p in para_list if p != ""]
     audio_list = []
-    if not cut_by_sent:
-        for idx, p in enumerate(para_list):
-            skip_start = idx != 0
-            skip_end = idx != len(para_list) - 1
-            audio = infer(
+    for p in para_list:
+        if not cut_by_sent:
+            audio_list += process_text(
                 p,
+                speaker,
+                sdp_ratio,
+                noise_scale,
+                noise_scale_w,
+                length_scale,
+                language,
+                reference_audio,
+                emotion,
+                style_text,
+                style_weight,
-                device=device,
-                skip_start=skip_start,
-                skip_end=skip_end,
             )
-            audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
-            audio_list.append(audio16bit)
             silence = np.zeros((int)(44100 * interval_between_para), dtype=np.int16)
             audio_list.append(silence)
-    else:
-        for idx, p in enumerate(para_list):
-            skip_start = idx != 0
-            skip_end = idx != len(para_list) - 1
+        else:
             audio_list_sent = []
             sent_list = re_matching.cut_sent(p)
+            sent_list = [s for s in sent_list if s != ""]
+            for s in sent_list:
+                audio_list_sent += process_text(
                     s,
+                    speaker,
+                    sdp_ratio,
+                    noise_scale,
+                    noise_scale_w,
+                    length_scale,
+                    language,
+                    reference_audio,
+                    emotion,
+                    style_text,
+                    style_weight,
-                    device=device,
-                    skip_start=skip_start,
-                    skip_end=skip_end,
                 )
-                audio_list_sent.append(audio)
                 silence = np.zeros((int)(44100 * interval_between_sent))
                 audio_list_sent.append(silence)
             if (interval_between_para - interval_between_sent) > 0:
@@ -196,10 +185,47 @@ def tts_split(
                 )  # 对完整句子做音量归一
                 audio_list.append(audio16bit)
     audio_concat = np.concatenate(audio_list)
-    return ("Success", (44100, audio_concat))
+    return ("Success", (hps.data.sampling_rate, audio_concat))
 
 
-def tts_fn(
+def process_mix(slice):
+    _speaker = slice.pop()
+    _text, _lang = [], []
+    for lang, content in slice:
+        content = content.split("|")
+        content = [part for part in content if part != ""]
+        if len(content) == 0:
+            continue
+        if len(_text) == 0:
+            _text = [[part] for part in content]
+            _lang = [[lang] for part in content]
+        else:
+            _text[-1].append(content[0])
+            _lang[-1].append(lang)
+            if len(content) > 1:
+                _text += [[part] for part in content[1:]]
+                _lang += [[lang] for part in content[1:]]
+    return _text, _lang, _speaker
+
+
+def process_auto(text):
+    _text, _lang = [], []
+    for slice in text.split("|"):
+        if slice == "":
+            continue
+        temp_text, temp_lang = [], []
+        sentences_list = split_by_language(slice, target_languages=["zh", "ja", "en"])
+        for sentence, lang in sentences_list:
+            if sentence == "":
+                continue
+            temp_text.append(sentence)
+            temp_lang.append(lang.upper())
+        _text.append(temp_text)
+        _lang.append(temp_lang)
+    return _text, _lang
+
+
+def process_text(
     text: str,
     speaker,
     sdp_ratio,
@@ -209,15 +235,9 @@ def tts_fn(
     language,
     reference_audio,
     emotion,
-    prompt_mode,
+    style_text=None,
+    style_weight=0,
 ):
-    if prompt_mode == "Audio prompt":
-        if reference_audio == None:
-            return ("Invalid audio prompt", None)
-        else:
-            reference_audio = load_audio(reference_audio)[1]
-    else:
-        reference_audio = None
     audio_list = []
     if language == "mix":
         bool_valid, str_valid = re_matching.validate_text(text)
@@ -226,120 +246,40 @@ def tts_fn(
             hps.data.sampling_rate,
             np.concatenate([np.zeros(hps.data.sampling_rate // 2)]),
         )
-        result = []
         for slice in re_matching.text_matching(text):
-                    temp_lang += temp_
-                else:
-                    if len(temp_contant) == 0:
-                        temp_contant.append([])
-                        temp_lang.append([])
-                    temp_contant[-1].append(content)
-                    temp_lang[-1].append(lang)
-            for i, j in zip(temp_lang, temp_contant):
-                result.append([*zip(i, j), _speaker])
-        for i, one in enumerate(result):
-            skip_start = i != 0
-            skip_end = i != len(result) - 1
-            _speaker = one.pop()
-            idx = 0
-            while idx < len(one):
-                text_to_generate = []
-                lang_to_generate = []
-                while True:
-                    lang, content = one[idx]
-                    temp_text = [content]
-                    if len(text_to_generate) > 0:
-                        text_to_generate[-1] += [temp_text.pop(0)]
-                        lang_to_generate[-1] += [lang]
-                    if len(temp_text) > 0:
-                        text_to_generate += [[i] for i in temp_text]
-                        lang_to_generate += [[lang]] * len(temp_text)
-                    if idx + 1 < len(one):
-                        idx += 1
-                    else:
-                        break
-                skip_start = (idx != 0) and skip_start
-                skip_end = (idx != len(one) - 1) and skip_end
-                print(text_to_generate, lang_to_generate)
-                audio_list.extend(
-                    generate_audio_multilang(
-                        text_to_generate,
-                        sdp_ratio,
-                        noise_scale,
-                        noise_scale_w,
-                        length_scale,
-                        _speaker,
-                        lang_to_generate,
-                        reference_audio,
-                        emotion,
-                        skip_start,
-                        skip_end,
-                    )
-                )
+            _text, _lang, _speaker = process_mix(slice)
+            if _speaker is None:
+                continue
+            print(f"Text: {_text}\nLang: {_lang}")
+            audio_list.extend(
+                generate_audio_multilang(
+                    _text,
+                    sdp_ratio,
+                    noise_scale,
+                    noise_scale_w,
+                    length_scale,
+                    _speaker,
+                    _lang,
+                    reference_audio,
+                    emotion,
                )
+            )
     elif language.lower() == "auto":
-            while idx < len(sentences_list):
-                text_to_generate = []
-                lang_to_generate = []
-                while True:
-                    content, lang = sentences_list[idx]
-                    temp_text = [content]
-                    lang = lang.upper()
-                    if lang == "JA":
-                        lang = "JP"
-                    if len(text_to_generate) > 0:
-                        text_to_generate[-1] += [temp_text.pop(0)]
-                        lang_to_generate[-1] += [lang]
-                    if len(temp_text) > 0:
-                        text_to_generate += [[i] for i in temp_text]
-                        lang_to_generate += [[lang]] * len(temp_text)
-                    if idx + 1 < len(sentences_list):
-                        idx += 1
-                    else:
-                        break
-                skip_start = (idx != 0) and skip_start
-                skip_end = (idx != len(sentences_list) - 1) and skip_end
-                print(text_to_generate, lang_to_generate)
-                audio_list.extend(
-                    generate_audio_multilang(
-                        text_to_generate,
-                        sdp_ratio,
-                        noise_scale,
-                        noise_scale_w,
-                        length_scale,
-                        speaker,
-                        lang_to_generate,
-                        reference_audio,
-                        emotion,
-                        skip_start,
-                        skip_end,
-                    )
-                )
-                idx += 1
+        _text, _lang = process_auto(text)
+        print(f"Text: {_text}\nLang: {_lang}")
+        audio_list.extend(
+            generate_audio_multilang(
+                _text,
+                sdp_ratio,
+                noise_scale,
+                noise_scale_w,
+                length_scale,
+                speaker,
+                _lang,
+                reference_audio,
+                emotion,
            )
+        )
     else:
         audio_list.extend(
             generate_audio(
@@ -352,13 +292,65 @@ def tts_fn(
                 language,
                 reference_audio,
                 emotion,
+                style_text,
+                style_weight,
             )
         )
+    return audio_list
+
+
+def tts_fn(
+    text: str,
+    speaker,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    language,
+    reference_audio,
+    emotion,
+    prompt_mode,
+    style_text=None,
+    style_weight=0,
+):
+    if style_text == "":
+        style_text = None
+    if prompt_mode == "Audio prompt":
+        if reference_audio == None:
+            return ("Invalid audio prompt", None)
+        else:
+            reference_audio = load_audio(reference_audio)[1]
+    else:
+        reference_audio = None
+
+    audio_list = process_text(
+        text,
+        speaker,
+        sdp_ratio,
+        noise_scale,
+        noise_scale_w,
+        length_scale,
+        language,
+        reference_audio,
+        emotion,
+        style_text,
+        style_weight,
+    )
 
     audio_concat = np.concatenate(audio_list)
     return "Success", (hps.data.sampling_rate, audio_concat)
 
 
+def format_utils(text, speaker):
+    _text, _lang = process_auto(text)
+    res = f"[{speaker}]"
+    for lang_s, content_s in zip(_lang, _text):
+        for lang, content in zip(lang_s, content_s):
+            res += f"<{lang.lower()}>{content}"
+        res += "|"
+    return "mix", res[:-1]
+
+
 def load_audio(path):
     audio, sr = librosa.load(path, 48000)
     # audio = librosa.resample(audio, 44100, 48000)
@@ -408,34 +400,37 @@ if __name__ == "__main__":
                 )
                 trans = gr.Button("中翻日", variant="primary")
                 slicer = gr.Button("快速切分", variant="primary")
+                formatter = gr.Button("检测语言,并整理为 MIX 格式", variant="primary")
                 speaker = gr.Dropdown(
                     choices=speakers, value=speakers[0], label="Speaker"
                 )
                 _ = gr.Markdown(
-                    value="提示模式(Prompt mode):可选文字提示或音频提示,用于生成文字或音频指定风格的声音。\n"
+                    value="提示模式(Prompt mode):可选文字提示或音频提示,用于生成文字或音频指定风格的声音。\n",
+                    visible=False,
                 )
                 prompt_mode = gr.Radio(
                     ["Text prompt", "Audio prompt"],
                     label="Prompt Mode",
                     value="Text prompt",
+                    visible=False,
                 )
                 text_prompt = gr.Textbox(
                     label="Text prompt",
                     placeholder="用文字描述生成风格。如:Happy",
                     value="Happy",
+                    visible=False,
                 )
                 audio_prompt = gr.Audio(
                     label="Audio prompt", type="filepath", visible=False
                 )
                 sdp_ratio = gr.Slider(
+                    minimum=0, maximum=1, value=0.5, step=0.1, label="SDP Ratio"
                 )
                 noise_scale = gr.Slider(
                     minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise"
                 )
                 noise_scale_w = gr.Slider(
+                    minimum=0.1, maximum=2, value=0.9, step=0.1, label="Noise_W"
                 )
                 length_scale = gr.Slider(
                     minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
@@ -445,6 +440,21 @@ if __name__ == "__main__":
                 )
                 btn = gr.Button("生成音频!", variant="primary")
             with gr.Column():
+                with gr.Accordion("融合文本语义", open=False):
+                    gr.Markdown(
+                        value="使用辅助文本的语意来辅助生成对话(语言保持与主文本相同)\n\n"
+                        "**注意**:不要使用**指令式文本**(如:开心),要使用**带有强烈情感的文本**(如:我好快乐!!!)\n\n"
+                        "效果较不明确,留空即为不使用该功能"
+                    )
+                    style_text = gr.Textbox(label="辅助文本")
+                    style_weight = gr.Slider(
+                        minimum=0,
+                        maximum=1,
+                        value=0.7,
+                        step=0.1,
+                        label="Weight",
+                        info="主文本和辅助文本的bert混合比率,0表示仅主文本,1表示仅辅助文本",
+                    )
                 with gr.Row():
                     with gr.Column():
                         interval_between_sent = gr.Slider(
@@ -487,6 +497,8 @@ if __name__ == "__main__":
                 audio_prompt,
                 text_prompt,
                 prompt_mode,
+                style_text,
+                style_weight,
             ],
             outputs=[text_output, audio_output],
         )
@@ -511,6 +523,8 @@ if __name__ == "__main__":
                 interval_between_sent,
                 audio_prompt,
                 text_prompt,
+                style_text,
+                style_weight,
             ],
             outputs=[text_output, audio_output],
         )
@@ -527,6 +541,12 @@ if __name__ == "__main__":
            outputs=[audio_prompt],
        )
 
+        formatter.click(
+            format_utils,
+            inputs=[text, speaker],
+            outputs=[language, text],
+        )
+
    print("推理页面已开启!")
    webbrowser.open(f"http://127.0.0.1:{config.webui_config.port}")
    app.launch(share=config.webui_config.share, server_port=config.webui_config.port)
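The inline language-mixing logic that used to live in tts_fn is now factored into `process_mix`, `process_auto` and `process_text`, and the new "检测语言,并整理为 MIX 格式" button calls `format_utils` to rewrite plain text into the MIX markup. A small usage sketch of that auto-detection path; the import path for `split_by_language` (tools/sentence.py) and the sample input/output are assumptions for illustration, not test output:

from tools.sentence import split_by_language  # assumed import path used by webui.py

def preview_mix(text: str, speaker: str) -> str:
    """Rebuild the MIX string the same way format_utils does, for a quick preview."""
    res = f"[{speaker}]"
    for slice in text.split("|"):
        if slice == "":
            continue
        for sentence, lang in split_by_language(slice, target_languages=["zh", "ja", "en"]):
            if sentence == "":
                continue
            res += f"<{lang.lower()}>{sentence}"
        res += "|"
    return res[:-1]

# e.g. preview_mix("你好,hello world", "TestSpeaker") would be expected to yield
# something like "[TestSpeaker]<zh>你好,<en>hello world"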
webui_preprocess.py
CHANGED
@@ -19,9 +19,9 @@ def generate_config(data_dir, batch_size):
     assert data_dir != "", "数据集名称不能为空"
     start_path, _, train_path, val_path, config_path = get_path(data_dir)
     if os.path.isfile(config_path):
-        config = json.load(open(config_path))
+        config = json.load(open(config_path, "r", encoding="utf-8"))
     else:
-        config = json.load(open("configs/config.json"))
+        config = json.load(open("configs/config.json", "r", encoding="utf-8"))
     config["data"]["training_files"] = train_path
     config["data"]["validation_files"] = val_path
     config["train"]["batch_size"] = batch_size
@@ -44,7 +44,7 @@ def resample(data_dir):
     in_dir = os.path.join(start_path, "raw")
     out_dir = os.path.join(start_path, "wavs")
     subprocess.run(
-        f"python resample.py "
+        f"python resample_legacy.py "
         f"--sr 44100 "
         f"--in_dir {in_dir} "
         f"--out_dir {out_dir} ",
@@ -60,7 +60,9 @@ def preprocess_text(data_dir):
     with open(lbl_path, "w", encoding="utf-8") as f:
         for line in lines:
             path, spk, language, text = line.strip().split("|")
-            path = os.path.join(start_path, "wavs", os.path.basename(path))
+            path = os.path.join(start_path, "wavs", os.path.basename(path)).replace(
+                "\\", "/"
+            )
             f.writelines(f"{path}|{spk}|{language}|{text}\n")
     subprocess.run(
         f"python preprocess_text.py "
@@ -83,16 +85,6 @@ def bert_gen(data_dir):
     return "BERT 特征文件生成完成"
 
 
-def clap_gen(data_dir):
-    assert data_dir != "", "数据集名称不能为空"
-    _, _, _, _, config_path = get_path(data_dir)
-    subprocess.run(
-        f"python clap_gen.py " f"--config {config_path}",
-        shell=True,
-    )
-    return "CLAP 特征文件生成完成"
-
-
 if __name__ == "__main__":
     with gr.Blocks() as app:
         with gr.Row():
@@ -100,13 +92,13 @@ if __name__ == "__main__":
             _ = gr.Markdown(
                 value="# Bert-VITS2 数据预处理\n"
                 "## 预先准备:\n"
+                "下载 BERT 和 WavLM 模型:\n"
                 "- [中文 RoBERTa](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)\n"
                 "- [日文 DeBERTa](https://huggingface.co/ku-nlp/deberta-v2-large-japanese-char-wwm)\n"
                 "- [英文 DeBERTa](https://huggingface.co/microsoft/deberta-v3-large)\n"
+                "- [WavLM](https://huggingface.co/microsoft/wavlm-base-plus)\n"
                 "\n"
+                "将 BERT 模型放置到 `bert` 文件夹下,WavLM 模型放置到 `slm` 文件夹下,覆盖同名文件夹。\n"
                 "\n"
                 "数据准备:\n"
                 "将数据放置在 data 文件夹下,按照如下结构组织:\n"
@@ -156,12 +148,10 @@ if __name__ == "__main__":
             preprocess_text_btn = gr.Button(value="执行", variant="primary")
             _ = gr.Markdown(value="## 第四步:生成 BERT 特征文件")
             bert_gen_btn = gr.Button(value="执行", variant="primary")
-            _ = gr.Markdown(value="## 第五步:生成 CLAP 特征文件")
-            clap_gen_btn = gr.Button(value="执行", variant="primary")
             _ = gr.Markdown(
                 value="## 训练模型及部署:\n"
                 "修改根目录下的 `config.yml` 中 `dataset_path` 一项为 `data/{你的数据集名称}`\n"
-                "- 训练:将[预训练模型文件](https://openi.pcl.ac.cn/Stardust_minus/Bert-VITS2/modelmanage/show_model)(`D_0.pth`、`DUR_0.pth` 和 `G_0.pth`)放到 `data/{你的数据集名称}/models` 文件夹下,执行 `torchrun --nproc_per_node=1 train_ms.py` 命令(多卡运行可参考 `run_MnodesAndMgpus.sh` 中的命令。\n"
+                "- 训练:将[预训练模型文件](https://openi.pcl.ac.cn/Stardust_minus/Bert-VITS2/modelmanage/show_model)(`D_0.pth`、`DUR_0.pth`、`WD_0.pth` 和 `G_0.pth`)放到 `data/{你的数据集名称}/models` 文件夹下,执行 `torchrun --nproc_per_node=1 train_ms.py` 命令(多卡运行可参考 `run_MnodesAndMgpus.sh` 中的命令。\n"
                 "- 部署:修改根目录下的 `config.yml` 中 `webui` 下 `model` 一项为 `models/{权重文件名}.pth` (如 G_10000.pth),然后执行 `python webui.py`"
             )
@@ -171,7 +161,6 @@ if __name__ == "__main__":
         resample_btn.click(resample, inputs=[data_dir], outputs=[info])
         preprocess_text_btn.click(preprocess_text, inputs=[data_dir], outputs=[info])
         bert_gen_btn.click(bert_gen, inputs=[data_dir], outputs=[info])
-        clap_gen_btn.click(clap_gen, inputs=[data_dir], outputs=[info])
 
     webbrowser.open("http://127.0.0.1:7860")
     app.launch(share=False, server_port=7860)
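The resample step of the preprocessing page now shells out to the new resample_legacy.py with the same flags as before. A minimal sketch of the equivalent call for running that step outside the WebUI, assuming the `data/{dataset}/raw` and `data/{dataset}/wavs` layout described above; `check=True` is an addition so a failed resample raises instead of passing silently:

import os
import subprocess

def resample_dataset(dataset_dir: str) -> None:
    """Mirror of the WebUI 'resample' step, run from a terminal."""
    in_dir = os.path.join(dataset_dir, "raw")
    out_dir = os.path.join(dataset_dir, "wavs")
    subprocess.run(
        f"python resample_legacy.py "
        f"--sr 44100 "
        f"--in_dir {in_dir} "
        f"--out_dir {out_dir} ",
        shell=True,
        check=True,
    )

# resample_dataset("data/your_dataset")  # writes 44.1 kHz wavs into data/your_dataset/wavs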