Commit e6cd719: "Upload 159 files"
Committed by Mahiruoshi
Parent(s): 45d2fcd

This view is limited to 50 files because the commit contains too many changes.
- .gitignore +17 -0
- .gitmodules +0 -0
- .pre-commit-config.yaml +4 -4
- Data/BangDreamV22/configs/config.json +197 -0
- Data/BangDreamV22/models/G_51000.pth +3 -0
- README.md +42 -13
- app.py +20 -43
- bert_gen.py +8 -6
- clap_gen.py +64 -0
- clap_wrapper.py +49 -0
- commons.py +6 -14
- compress_model.py +89 -0
- config.py +7 -2
- config.yml +25 -22
- configs/config.json +220 -217
- css/custom.css +18 -0
- data_utils.py +19 -8
- default_config.yml +15 -12
- emotional/clap-htsat-fused/.gitattributes +34 -0
- emotional/clap-htsat-fused/README.md +107 -0
- emotional/clap-htsat-fused/config.json +207 -0
- emotional/clap-htsat-fused/merges.txt +0 -0
- emotional/clap-htsat-fused/preprocessor_config.json +22 -0
- emotional/clap-htsat-fused/pytorch_model.bin +3 -0
- emotional/clap-htsat-fused/special_tokens_map.json +15 -0
- emotional/clap-htsat-fused/tokenizer.json +0 -0
- emotional/clap-htsat-fused/tokenizer_config.json +16 -0
- emotional/clap-htsat-fused/vocab.json +0 -0
- empty_emo.npy +3 -0
- export_onnx.py +4 -48
- img/yuyu.png +0 -0
- img/参数说明.png +0 -0
- img/宵宫.png +0 -0
- img/微信图片_20231010105112.png +0 -0
- img/神里绫华.png +0 -0
- img/纳西妲.png +0 -0
- infer.py +381 -0
- models.py +66 -35
- monotonic_align/__pycache__/__init__.cpython-311.pyc +0 -0
- monotonic_align/__pycache__/core.cpython-311.pyc +0 -0
- onnx_modules/V200/__init__.py +0 -0
- onnx_modules/V200/attentions_onnx.py +378 -0
- onnx_modules/V200/models_onnx.py +990 -0
- onnx_modules/V200/text/__init__.py +1 -0
- onnx_modules/V200/text/bert_utils.py +23 -0
- onnx_modules/V200/text/chinese.py +198 -0
- onnx_modules/V200/text/chinese_bert.py +101 -0
- onnx_modules/V200/text/cleaner.py +28 -0
- onnx_modules/V200/text/english.py +362 -0
- onnx_modules/V200/text/english_bert_mock.py +42 -0
.gitignore
CHANGED
@@ -166,3 +166,20 @@ cython_debug/
 filelists/*
 !/filelists/esd.list
 data/*
+/*.yml
+!/default_config.yml
+/Web/
+/emotional/*/*.bin
+/bert/*/*.bin
+/bert/*/*.h5
+/bert/*/*.model
+/bert/*/*.safetensors
+/bert/*/*.msgpack
+asr_transcript.py
+extract_list.py
+dataset
+/Data
+Model
+raw/
+logs/
+Data/*
.gitmodules
ADDED
File without changes
.pre-commit-config.yaml
CHANGED
@@ -1,24 +1,24 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.
+    rev: v4.5.0
     hooks:
       - id: check-yaml
       - id: end-of-file-fixer
       - id: trailing-whitespace

   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.
+    rev: v0.1.7
     hooks:
       - id: ruff
        args: [ --fix ]

   - repo: https://github.com/psf/black
-    rev: 23.
+    rev: 23.11.0
     hooks:
       - id: black

   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.
+    rev: v2.2.6
     hooks:
       - id: codespell
        files: ^.*\.(py|md|rst|yml)$
Data/BangDreamV22/configs/config.json
ADDED
@@ -0,0 +1,197 @@
{
  "train": {
    "log_interval": 200,
    "eval_interval": 3000,
    "seed": 42,
    "epochs": 1000,
    "learning_rate": 0.0002,
    "betas": [0.8, 0.99],
    "eps": 1e-09,
    "batch_size": 10,
    "fp16_run": false,
    "lr_decay": 0.99995,
    "segment_size": 16384,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 45,
    "c_kl": 1.0,
    "skip_optimizer": true,
    "freeze_ZH_bert": false,
    "freeze_JP_bert": false,
    "freeze_EN_bert": false
  },
  "data": {
    "training_files": "Data/BangDream/filelists/train.list",
    "validation_files": "Data/BangDream/filelists/val.list",
    "max_wav_value": 32768.0,
    "sampling_rate": 44100,
    "filter_length": 2048,
    "hop_length": 512,
    "win_length": 2048,
    "n_mel_channels": 128,
    "mel_fmin": 0.0,
    "mel_fmax": null,
    "add_blank": true,
    "n_speakers": 99,
    "cleaned_text": true,
    "spk2id": {
      "香澄": 0, "有咲": 1, "沙綾": 2, "りみ": 3, "たえ": 4,
      "沙綾、りみ、たえ": 5, "三月七1": 6, "紗夜": 7, "ロック": 8, "パレオ": 9,
      "レイヤ": 10, "チュチュ": 11, "彩": 12, "千聖": 13, "イヴ": 14,
      "日菜": 15, "麻弥": 16, "蘭": 17, "モカ": 18, "巴": 19,
      "ひまり": 20, "つぐみ": 21, "はぐみ": 22, "花音": 23, "美咲": 24,
      "薫": 25, "こころ": 26, "つくし": 27, "七深": 28, "透子": 29,
      "ましろ": 30, "瑠唯": 31, "友希那": 32, "あこ": 33, "リサ": 34,
      "燐子": 35, "燈": 36, "愛音": 37, "楽奈": 38, "そよ": 39,
      "立希": 40, "ますき": 41, "祥子": 42, "睦": 43, "海鈴": 44,
      "にゃむ": 45, "初華": 46, "華戀": 47, "晶": 48, "光": 49,
      "未知留": 50, "香子": 51, "雙葉": 52, "真晝": 53, "艾露": 54,
      "珠緒": 55, "艾露露": 56, "純那": 57, "克洛迪娜": 58, "真矢": 59,
      "奈奈": 60, "壘": 61, "文": 62, "一愛": 63, "菈樂菲": 64,
      "司": 65, "美空": 66, "靜羽": 67, "悠悠子": 68, "八千代": 69,
      "栞": 70, "美帆": 71, "芙蘿菈": 72, "克蕾兒": 73, "安德露": 74,
      "瑪莉亞貝菈": 75, "克拉迪亞": 76, "桃樂西": 77, "瑪麗安": 78, "八重神子1": 79,
      "娜塔莎": 80, "宵宫": 81, "派蒙11": 82, "派蒙13": 83, "派蒙3": 84,
      "派蒙7": 85, "派蒙8": 86, "派蒙9": 87, "派蒙10": 88, "派蒙6": 89,
      "派蒙4": 90, "派蒙1": 91, "派蒙2": 92, "派蒙15": 93, "派蒙16": 94,
      "派蒙14": 95, "派蒙12": 96, "派蒙5": 97, "纳西妲1": 98
    }
  },
  "model": {
    "use_spk_conditioned_encoder": true,
    "use_noise_scaled_mas": true,
    "use_mel_posterior_encoder": false,
    "use_duration_discriminator": true,
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0.1,
    "resblock": "1",
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "upsample_rates": [8, 8, 2, 2, 2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [16, 16, 8, 2, 2],
    "n_layers_q": 3,
    "use_spectral_norm": false,
    "gin_channels": 256
  },
  "version": "2.2"
}
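
For reference, the WebUI in this commit reads this file through `utils.get_hparams_from_file` (see the `app.py` change below). A minimal sketch of inspecting a few of its fields with the standard library only, assuming the path used in `app.py`:

```python
import json

# Minimal sketch: inspect the training/data settings of the uploaded config.
# The path matches the one hard-coded in app.py in this commit.
with open("Data/BangDreamV22/configs/config.json", encoding="utf-8") as f:
    hps = json.load(f)

print(hps["train"]["batch_size"])    # 10
print(hps["data"]["sampling_rate"])  # 44100
print(len(hps["data"]["spk2id"]))    # 99 speakers
print(hps["version"])                # "2.2"
```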
Data/BangDreamV22/models/G_51000.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:521be4508c8b8b81e81201372cce0ac09cef35ca0f66b3d981f1689a601db3c5
size 750066550
README.md
CHANGED
@@ -1,13 +1,42 @@
<div align="center">

<img alt="LOGO" src="https://cdn.jsdelivr.net/gh/fishaudio/fish-diffusion@main/images/logo_512x512.png" width="256" height="256" />

# Bert-VITS2

VITS2 Backbone with multilingual bert

For quick guide, please refer to `webui_preprocess.py`.

简易教程请参见 `webui_preprocess.py`。

## 请注意,本项目核心思路来源于[anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS) 一个非常好的tts项目
## MassTTS的演示demo为[ai版峰哥锐评峰哥本人,并找回了在金三角失落的腰子](https://www.bilibili.com/video/BV1w24y1c7z9)

[//]: # (## 本项目与[PlayVoice/vits_chinese](https://github.com/PlayVoice/vits_chinese) 没有任何关系)

[//]: # ()
[//]: # (本仓库来源于之前朋友分享了ai峰哥的视频,本人被其中的效果惊艳,在自己尝试MassTTS以后发现fs在音质方面与vits有一定差距,并且training的pipeline比vits更复杂,因此按照其思路将bert)

## 成熟的旅行者/开拓者/舰长/博士/sensei/猎魔人/喵喵露/V应当参阅代码自己学习如何训练。

### 严禁将此项目用于一切违反《中华人民共和国宪法》,《中华人民共和国刑法》,《中华人民共和国治安管理处罚法》和《中华人民共和国民法典》之用途。
### 严禁用于任何政治相关用途。
#### Video:https://www.bilibili.com/video/BV1hp4y1K78E
#### Demo:https://www.bilibili.com/video/BV1TF411k78w
#### QQ Group:815818430
## References
+ [anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS)
+ [jaywalnut310/vits](https://github.com/jaywalnut310/vits)
+ [p0p4k/vits2_pytorch](https://github.com/p0p4k/vits2_pytorch)
+ [svc-develop-team/so-vits-svc](https://github.com/svc-develop-team/so-vits-svc)
+ [PaddlePaddle/PaddleSpeech](https://github.com/PaddlePaddle/PaddleSpeech)
+ [emotional-vits](https://github.com/innnky/emotional-vits)
+ [Bert-VITS2-en](https://github.com/xwan07017/Bert-VITS2-en)
+ [Bert-VITS2-UI](https://github.com/jiangyuxiaoxiao/Bert-VITS2-UI)
## 感谢所有贡献者作出的努力
<a href="https://github.com/fishaudio/Bert-VITS2/graphs/contributors" target="_blank">
  <img src="https://contrib.rocks/image?repo=fishaudio/Bert-VITS2"/>
</a>

[//]: # (# 本项目所有代码引用均已写明,bert部分代码思路来源于[AI峰哥](https://www.bilibili.com/video/BV1w24y1c7z9),与[vits_chinese](https://github.com/PlayVoice/vits_chinese)无任何关系。欢迎各位查阅代码。同时,我们也对该开发者的[碰瓷,乃至开盒开发者的行为](https://www.bilibili.com/read/cv27101514/)表示强烈谴责。)
app.py
CHANGED
@@ -23,11 +23,8 @@ import torch.nn as nn
 from torch.utils.data import Dataset
 from torch.utils.data import DataLoader, Dataset
 from tqdm import tqdm
-from 
-
-    Wav2Vec2Model,
-    Wav2Vec2PreTrainedModel,
-)
+from clap_wrapper import get_clap_audio_feature, get_clap_text_feature
+
 
 import gradio as gr
 
@@ -37,7 +34,6 @@ from config import config
 import torch
 import commons
 from text import cleaned_text_to_sequence, get_bert
-from emo_gen import process_func, EmotionModel, Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2PreTrainedModel, RegressionHead
 from text.cleaner import clean_text
 import utils
 
@@ -46,7 +42,7 @@ from text.symbols import symbols
 import sys
 
 net_g = None
-
+'''
 device = (
     "cuda:0"
     if torch.cuda.is_available()
@@ -56,6 +52,7 @@ device = (
         else "cpu"
     )
 )
+'''
 device = "cpu"
 BandList = {
     "PoppinParty":["香澄","有咲","たえ","りみ","沙綾"],
@@ -73,7 +70,7 @@ BandList = {
     "西克菲尔特音乐学院":["晶","未知留","八千代","栞","美帆"]
 }
 
-def get_net_g(model_path: str,
+def get_net_g(model_path: str, device: str, hps):
     net_g = SynthesizerTrn(
         len(symbols),
         hps.data.filter_length // 2 + 1,
@@ -125,27 +122,6 @@ def get_text(text, language_str, hps, device):
     language = torch.LongTensor(language)
     return bert, ja_bert, en_bert, phone, tone, language
 
-def get_emo_(reference_audio, emotion):
-
-    if (emotion == 10 and reference_audio):
-        emo = torch.from_numpy(get_emo(reference_audio))
-    else:
-        emo = torch.Tensor([emotion])
-
-    return emo
-
-def get_emo(path):
-    wav, sr = librosa.load(path, 16000)
-    device = config.bert_gen_config.device
-    return process_func(
-        np.expand_dims(wav, 0).astype(np.float64),
-        sr,
-        emotional_model,
-        emotional_processor,
-        device,
-        embeddings=True,
-    ).squeeze(0)
-
 def infer(
     text,
     sdp_ratio,
@@ -154,16 +130,18 @@ def infer(
     length_scale,
     sid,
     reference_audio=None,
-    emotion=
+    emotion='Happy',
 ):
 
     language= 'JP' if is_japanese(text) else 'ZH'
-
+    if isinstance(reference_audio, np.ndarray):
+        emo = get_clap_audio_feature(reference_audio, device)
+    else:
+        emo = get_clap_text_feature(emotion, device)
+    emo = torch.squeeze(emo, dim=1)
     bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
         text, language, hps, device
     )
-    emo = get_emo_(reference_audio, emotion)
-    print(emo)
     with torch.no_grad():
         x_tst = phones.to(device).unsqueeze(0)
         tones = tones.to(device).unsqueeze(0)
@@ -212,18 +190,14 @@ def loadmodel(model):
     return "success"
 
 if __name__ == "__main__":
-    emotional_model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim"
-    REPO_ID = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
-    emotional_processor = Wav2Vec2Processor.from_pretrained(emotional_model_name)
-    emotional_model = EmotionModel.from_pretrained(emotional_model_name).to(device)
     languages = [ "Auto", "ZH", "JP"]
     modelPaths = []
-    for dirpath, dirnames, filenames in os.walk('Data/
+    for dirpath, dirnames, filenames in os.walk('Data/BangDreamV22/models/'):
        for filename in filenames:
            modelPaths.append(os.path.join(dirpath, filename))
-    hps = utils.get_hparams_from_file('Data/
+    hps = utils.get_hparams_from_file('Data/BangDreamV22/configs/config.json')
     net_g = get_net_g(
-        model_path=modelPaths[-1],
+        model_path=modelPaths[-1], device=device, hps=hps
     )
     speaker_ids = hps.data.spk2id
     speakers = list(speaker_ids.keys())
@@ -247,9 +221,12 @@ if __name__ == "__main__":
                 length_scale = gr.Slider(
                     minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
                 )
-                emotion = gr.
+                emotion = gr.Textbox(
+                    label="Text prompt",
+                    placeholder="用文字描述生成风格。如:Happy",
+                    value="Happy",
+                    visible=True,
+                )
             with gr.Accordion(label="参数设定", open=False):
                 sdp_ratio = gr.Slider(
                     minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
                 )
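
The `infer` rewrite above replaces the old wav2vec2 emotion embedding with a CLAP prompt: either a text description (default `'Happy'`) or a reference waveform is mapped into the same embedding space. A minimal sketch of that selection step in isolation, assuming `clap_wrapper.py` from this commit is importable and the CLAP weights are present under `./emotional/clap-htsat-fused`:

```python
import numpy as np
import torch

from clap_wrapper import get_clap_audio_feature, get_clap_text_feature


def emotion_embedding(reference_audio, emotion: str, device: str = "cpu") -> torch.Tensor:
    # Same branch as the new infer(): a 48 kHz waveform takes priority over the text prompt.
    if isinstance(reference_audio, np.ndarray):
        emo = get_clap_audio_feature(reference_audio, device)
    else:
        emo = get_clap_text_feature(emotion, device)
    # (512, 1) -> (512,), matching the per-item buffer in data_utils.py
    return torch.squeeze(emo, dim=1)


emo = emotion_embedding(None, "Happy")
print(emo.shape)
```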
bert_gen.py
CHANGED
@@ -1,12 +1,14 @@
+import argparse
+from multiprocessing import Pool, cpu_count
+
 import torch
-
+import torch.multiprocessing as mp
+from tqdm import tqdm
+
 import commons
 import utils
-from tqdm import tqdm
-from text import cleaned_text_to_sequence, get_bert
-import argparse
-import torch.multiprocessing as mp
 from config import config
+from text import cleaned_text_to_sequence, get_bert
 
 
 def process_line(line):
@@ -64,7 +66,7 @@ if __name__ == "__main__":
     with open(hps.data.validation_files, encoding="utf-8") as f:
         lines.extend(f.readlines())
     if len(lines) != 0:
-        num_processes = args.num_processes
+        num_processes = min(args.num_processes, cpu_count())
        with Pool(processes=num_processes) as pool:
            for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
                pass
clap_gen.py
ADDED
@@ -0,0 +1,64 @@
import argparse
from multiprocessing import Pool, cpu_count

import torch
import torch.multiprocessing as mp
from tqdm import tqdm

import utils
from config import config
from clap_wrapper import get_clap_audio_feature
import librosa
import os

os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"


def process_line(line):
    device = config.emo_gen_config.device
    if config.emo_gen_config.use_multi_device:
        rank = mp.current_process()._identity
        rank = rank[0] if len(rank) > 0 else 0
        if torch.cuda.is_available():
            gpu_id = rank % torch.cuda.device_count()
            device = torch.device(f"cuda:{gpu_id}")
        else:
            device = torch.device("cpu")
    wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")

    clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.npy")
    if os.path.isfile(clap_path):
        return

    audio = librosa.load(wav_path, 48000)[0]
    # audio = librosa.resample(audio, 44100, 48000)

    clap = get_clap_audio_feature(audio, device)
    torch.save(clap, clap_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c", "--config", type=str, default=config.emo_gen_config.config_path
    )
    parser.add_argument(
        "--num_processes", type=int, default=config.emo_gen_config.num_processes
    )
    args, _ = parser.parse_known_args()
    config_path = args.config
    hps = utils.get_hparams_from_file(config_path)
    lines = []
    with open(hps.data.training_files, encoding="utf-8") as f:
        lines.extend(f.readlines())

    with open(hps.data.validation_files, encoding="utf-8") as f:
        lines.extend(f.readlines())
    if len(lines) != 0:
        num_processes = min(args.num_processes, cpu_count())
        with Pool(processes=num_processes) as pool:
            for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
                pass

    print(f"clap生成完毕!, 共有{len(lines)}个emo.pt生成!")
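
The script stores one CLAP embedding per utterance next to its audio file, using the `.emo.npy` suffix that `data_utils.py` later expects. Note the file is written with `torch.save`, so it is read back with `torch.load` despite the `.npy` extension. A small sketch of loading one feature, with a hypothetical `demo.wav` whose feature has already been generated:

```python
import torch

# clap_gen.py saves get_clap_audio_feature(audio, device) for "demo.wav"
# as "demo.emo.npy" via torch.save; read it back the same way data_utils.py does.
emo = torch.load("demo.emo.npy", map_location="cpu")  # hypothetical path
emo = torch.squeeze(emo, dim=1)                       # (512,) CLAP embedding
print(emo.shape)
```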
clap_wrapper.py
ADDED
@@ -0,0 +1,49 @@
import sys

import torch
from transformers import ClapModel, ClapProcessor

from config import config

models = dict()
processor = ClapProcessor.from_pretrained("./emotional/clap-htsat-fused")


def get_clap_audio_feature(audio_data, device=config.bert_gen_config.device):
    if (
        sys.platform == "darwin"
        and torch.backends.mps.is_available()
        and device == "cpu"
    ):
        device = "mps"
    if not device:
        device = "cuda"
    if device not in models.keys():
        models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to(
            device
        )
    with torch.no_grad():
        inputs = processor(
            audios=audio_data, return_tensors="pt", sampling_rate=48000
        ).to(device)
        emb = models[device].get_audio_features(**inputs)
    return emb.T


def get_clap_text_feature(text, device=config.bert_gen_config.device):
    if (
        sys.platform == "darwin"
        and torch.backends.mps.is_available()
        and device == "cpu"
    ):
        device = "mps"
    if not device:
        device = "cuda"
    if device not in models.keys():
        models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to(
            device
        )
    with torch.no_grad():
        inputs = processor(text=text, return_tensors="pt").to(device)
        emb = models[device].get_text_features(**inputs)
    return emb.T
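
A short usage sketch of the two helpers above, assuming the `laion/clap-htsat-fused` weights have been downloaded into `./emotional/clap-htsat-fused` (the hard-coded path) and that audio is supplied at 48 kHz, as the processor call expects:

```python
import numpy as np

from clap_wrapper import get_clap_audio_feature, get_clap_text_feature

# Text prompt -> transposed projection output, expected shape (512, 1).
text_emb = get_clap_text_feature("Happy", device="cpu")

# One second of silence at 48 kHz -> audio embedding of the same shape.
audio = np.zeros(48000, dtype=np.float32)
audio_emb = get_clap_audio_feature(audio, device="cpu")

print(text_emb.shape, audio_emb.shape)
```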
commons.py
CHANGED
@@ -46,26 +46,18 @@ def rand_gumbel_like(x):
 
 
 def slice_segments(x, ids_str, segment_size=4):
-        if idx_str < 0:
-            i1 = x.size(2) + idx_str
-            r1 = x[i, :, i1:]
-            r2 = x[i, :, :idx_end]
-            ret[i] = torch.cat([r1, r2], dim=1)
-        else:
-            ret[i] = x[i, :, idx_str:idx_end]
-    return ret
+    gather_indices = ids_str.view(x.size(0), 1, 1).repeat(
+        1, x.size(1), 1
+    ) + torch.arange(segment_size, device=x.device)
+    return torch.gather(x, 2, gather_indices)
 
 
 def rand_slice_segments(x, x_lengths=None, segment_size=4):
     b, d, t = x.size()
     if x_lengths is None:
         x_lengths = t
-    ids_str_max = x_lengths - segment_size + 1
-    ids_str = (torch.rand([b]
+    ids_str_max = torch.clamp(x_lengths - segment_size + 1, min=0)
+    ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long)
     ret = slice_segments(x, ids_str, segment_size)
     return ret, ids_str
 
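
The new `slice_segments` replaces the per-sample Python loop with a single `torch.gather`: for each batch item it builds `segment_size` consecutive time indices starting at `ids_str[i]` and gathers them along the time axis. A standalone sketch of the same idea (not the project function itself), handy for checking it against naive slicing:

```python
import torch


def slice_segments_gather(x: torch.Tensor, ids_str: torch.Tensor, segment_size: int = 4) -> torch.Tensor:
    # x: (batch, channels, time); ids_str: (batch,) start index per item.
    idx = ids_str.view(x.size(0), 1, 1).repeat(1, x.size(1), 1) + torch.arange(
        segment_size, device=x.device
    )
    return torch.gather(x, 2, idx)


x = torch.arange(2 * 3 * 10, dtype=torch.float32).view(2, 3, 10)
ids = torch.tensor([1, 5])
out = slice_segments_gather(x, ids, 4)
# Matches per-sample slicing x[i, :, ids[i]:ids[i] + 4] for each i.
assert torch.equal(out[0], x[0, :, 1:5]) and torch.equal(out[1], x[1, :, 5:9])
```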
compress_model.py
ADDED
@@ -0,0 +1,89 @@
from collections import OrderedDict
from text.symbols import symbols
import torch

from tools.log import logger
import utils
from models import SynthesizerTrn
import os


def copyStateDict(state_dict):
    if list(state_dict.keys())[0].startswith("module"):
        start_idx = 1
    else:
        start_idx = 0
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = ",".join(k.split(".")[start_idx:])
        new_state_dict[name] = v
    return new_state_dict


def removeOptimizer(config: str, input_model: str, ishalf: bool, output_model: str):
    hps = utils.get_hparams_from_file(config)

    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model,
    )

    optim_g = torch.optim.AdamW(
        net_g.parameters(),
        hps.train.learning_rate,
        betas=hps.train.betas,
        eps=hps.train.eps,
    )

    state_dict_g = torch.load(input_model, map_location="cpu")
    new_dict_g = copyStateDict(state_dict_g)
    keys = []
    for k, v in new_dict_g["model"].items():
        if "enc_q" in k:
            continue  # noqa: E701
        keys.append(k)

    new_dict_g = (
        {k: new_dict_g["model"][k].half() for k in keys}
        if ishalf
        else {k: new_dict_g["model"][k] for k in keys}
    )

    torch.save(
        {
            "model": new_dict_g,
            "iteration": 0,
            "optimizer": optim_g.state_dict(),
            "learning_rate": 0.0001,
        },
        output_model,
    )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", type=str, default="configs/config.json")
    parser.add_argument("-i", "--input", type=str)
    parser.add_argument("-o", "--output", type=str, default=None)
    parser.add_argument(
        "-hf", "--half", action="store_true", default=False, help="Save as FP16"
    )

    args = parser.parse_args()

    output = args.output

    if output is None:
        import os.path

        filename, ext = os.path.splitext(args.input)
        half = "_half" if args.half else ""
        output = filename + "_release" + half + ext

    removeOptimizer(args.config, args.input, args.half, output)
    logger.info(f"压缩模型成功, 输出模型: {os.path.abspath(output)}")
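
`removeOptimizer` strips the `enc_q` posterior-encoder weights and the trained optimizer state from a training checkpoint, optionally casting the remaining weights to FP16, which is what produces the much smaller `_release` checkpoints. A sketch of calling it directly instead of via the CLI, with illustrative paths based on the files uploaded in this commit:

```python
# Illustrative only: compress the uploaded generator checkpoint from Python.
from compress_model import removeOptimizer

removeOptimizer(
    config="Data/BangDreamV22/configs/config.json",    # config from this commit
    input_model="Data/BangDreamV22/models/G_51000.pth",
    ishalf=True,                                       # save weights as FP16
    output_model="Data/BangDreamV22/models/G_51000_release_half.pth",
)
```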
config.py
CHANGED
@@ -38,7 +38,7 @@ class Preprocess_text_config:
         train_path: str,
         val_path: str,
         config_path: str,
-
+        val_per_lang: int = 5,
         max_val_total: int = 10000,
         clean: bool = True,
     ):
@@ -47,7 +47,7 @@ class Preprocess_text_config:
         self.train_path: str = train_path  # 训练集路径,可以不填。不填则将在原始文本目录生成
         self.val_path: str = val_path  # 验证集路径,可以不填。不填则将在原始文本目录生成
         self.config_path: str = config_path  # 配置文件路径
-        self.
+        self.val_per_lang: int = val_per_lang  # 每个speaker的验证集条数
         self.max_val_total: int = max_val_total  # 验证集最大条数,多于的会被截断并放到训练集中
         self.clean: bool = clean  # 是否进行数据清洗
 
@@ -99,10 +99,12 @@ class Emo_gen_config:
         config_path: str,
         num_processes: int = 2,
         device: str = "cuda",
+        use_multi_device: bool = False,
     ):
         self.config_path = config_path
         self.num_processes = num_processes
         self.device = device
+        self.use_multi_device = use_multi_device
 
     @classmethod
     def from_dict(cls, dataset_path: str, data: Dict[str, any]):
@@ -222,6 +224,9 @@ class Config:
         self.bert_gen_config: Bert_gen_config = Bert_gen_config.from_dict(
             dataset_path, yaml_config["bert_gen"]
         )
+        self.emo_gen_config: Emo_gen_config = Emo_gen_config.from_dict(
+            dataset_path, yaml_config["emo_gen"]
+        )
         self.train_ms_config: Train_ms_config = Train_ms_config.from_dict(
             dataset_path, yaml_config["train_ms"]
         )
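
With `Emo_gen_config` wired into `Config`, the `emo_gen` block of `config.yml` becomes available as `config.emo_gen_config`, which is exactly what `clap_gen.py` reads. A small sketch, assuming a valid `config.yml` is in place:

```python
from config import config

# Values mirror the emo_gen section of config.yml (see the change below).
print(config.emo_gen_config.config_path)
print(config.emo_gen_config.num_processes)     # 4 in this commit's config.yml
print(config.emo_gen_config.device)            # "cuda"
print(config.emo_gen_config.use_multi_device)  # False
```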
config.yml
CHANGED
@@ -4,10 +4,10 @@
 # 拟提供通用路径配置,统一存放数据,避免数据放得很乱
 # 每个数据集与其对应的模型存放至统一路径下,后续所有的路径配置均为相对于datasetPath的路径
 # 不填或者填空则路径为相对于项目根目录的路径
-dataset_path: "Data/
+dataset_path: "Data/"
 
 # 模型镜像源,默认huggingface,使用openi镜像源需指定openi_token
-mirror: "
+mirror: ""
 openi_token: ""  # openi token
 
 # resample 音频重采样配置
@@ -17,16 +17,16 @@ resample:
   sampling_rate: 44100
   # 音频文件输入路径,重采样会将该路径下所有.wav音频文件重采样
   # 请填入相对于datasetPath的相对路径
-  in_dir: ""  # 相对于根目录的路径为 /datasetPath/in_dir
+  in_dir: "audios/raw"  # 相对于根目录的路径为 /datasetPath/in_dir
   # 音频文件重采样后输出路径
-  out_dir: ""
+  out_dir: "audios/wavs"
 
 
 # preprocess_text 数据集预处理相关配置
 # 注意, “:” 后需要加空格
 preprocess_text:
   # 原始文本文件路径,文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
-  transcription_path: "filelists
+  transcription_path: "filelists/你的数据集文本.list"
   # 数据清洗后文本路径,可以不填。不填则将在原始文本目录生成
   cleaned_path: ""
   # 训练集路径
@@ -34,11 +34,11 @@ preprocess_text:
   # 验证集路径
   val_path: "filelists/val.list"
   # 配置文件路径
-  config_path: "
-  #
-
+  config_path: "config.json"
+  # 每个语言的验证集条数
+  val_per_lang: 4
   # 验证集最大条数,多于的会被截断并放到训练集中
-  max_val_total:
+  max_val_total: 12
   # 是否进行数据清洗
   clean: true
 
@@ -47,9 +47,9 @@ preprocess_text:
 # 注意, “:” 后需要加空格
 bert_gen:
   # 训练数据集配置文件路径
-  config_path: "
+  config_path: "config.json"
   # 并行数
-  num_processes:
+  num_processes: 4
   # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
   # 该选项同时决定了get_bert_feature的默认设备
   device: "cuda"
@@ -60,11 +60,13 @@ bert_gen:
 # 注意, “:” 后需要加空格
 emo_gen:
   # 训练数据集配置文件路径
-  config_path: "
+  config_path: "config.json"
   # 并行数
-  num_processes:
+  num_processes: 4
   # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
   device: "cuda"
+  # 使用多卡推理
+  use_multi_device: false
 
 # train 训练配置
 # 注意, “:” 后需要加空格
@@ -79,13 +81,13 @@ train_ms:
   #  THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
   # 底模设置
   base:
-    use_base_model:
+    use_base_model: false
     repo_id: "Stardust_minus/Bert-VITS2"
-    model_image: "Bert-VITS2_2.
+    model_image: "Bert-VITS2_2.2-Clap底模"  # openi网页的模型名
   # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
   model: "models"
   # 配置文件路径
-  config_path: "
+  config_path: "config.json"
   # 训练使用的worker,不建议超过CPU核心数
   num_workers: 16
   # 关闭此项可以节约接近50%的磁盘空间,但是可能导致实际训练速度变慢和更高的CPU使用率。
@@ -98,11 +100,11 @@ train_ms:
 # 注意, “:” 后需要加空格
 webui:
   # 推理设备
-  device: "
+  device: "cuda"
   # 模型路径
-  model: "models/
+  model: "models/G_8000.pth"
   # 配置文件路径
-  config_path: "
+  config_path: "config.json"
   # 端口号
   port: 7860
   # 是否公开部署,对外网开放
@@ -113,7 +115,7 @@ webui:
   language_identification_library: "langid"
 
 
-# server
+# server-fastapi配置
 # 注意, “:” 后需要加空格
 # 注意,本配置下的所有配置均为相对于根目录的路径
 server:
@@ -121,8 +123,10 @@ server:
   port: 5000
   # 模型默认使用设备:但是当前并没有实现这个配置。
   device: "cuda"
-  #
+  # 需要加载的所有模型的配置,可以填多个模型,也可以不填模型,等网页成功后手动加载模型
+  # 不加载模型的配置格式:删除默认给的两个模型配置,给models赋值 [ ],也就是空列表。参考模型2的speakers 即 models: [ ]
   # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。
+  # 也可以不填模型,等网页加载成功后手动填写models。
   models:
     - # 模型的路径
       model: ""
@@ -163,7 +167,6 @@ server:
       # 不必填写所有人物,不填的使用默认值
       speakers: [ ]  # 也可以不填
 
-
 # 百度翻译开放平台 api配置
 # api接入文档 https://api.fanyi.baidu.com/doc/21
 # 请不要在github等网站公开分享你的app id 与 key
configs/config.json
CHANGED
@@ -10,7 +10,7 @@
       0.99
     ],
     "eps": 1e-09,
-    "batch_size":
+    "batch_size": 12,
     "fp16_run": false,
     "lr_decay": 0.99995,
     "segment_size": 16384,
@@ -18,7 +18,10 @@
     "warmup_epochs": 0,
     "c_mel": 45,
     "c_kl": 1.0,
-    "skip_optimizer": true
+    "skip_optimizer": true,
+    "freeze_ZH_bert": false,
+    "freeze_JP_bert": false,
+    "freeze_EN_bert": false
   },
   "data": {
     "training_files": "filelists/train.list",
@@ -676,220 +679,220 @@
     "埃舍尔_EN": 638,
     "萨齐因_EN": 639,
     "古田_EN": 640,
+    "三月七_ZH": 641, "丹恒_ZH": 642, "希儿_ZH": 643, "娜塔莎_ZH": 644,
+    "希露瓦_ZH": 645, "瓦尔特_ZH": 646, "佩拉_ZH": 647, "布洛妮娅_ZH": 648,
+    "虎克_ZH": 649, "素裳_ZH": 650, "克拉拉_ZH": 651, "符玄_ZH": 652,
+    "白露_ZH": 653, "杰帕德_ZH": 654, "景元_ZH": 655, "藿藿_ZH": 656,
+    "姬子_ZH": 657, "穹_ZH": 658, "星_ZH": 659, "卡芙卡_ZH": 660,
+    "桂乃芬_ZH": 661, "艾丝妲_ZH": 662, "玲可_ZH": 663, "彦卿_ZH": 664,
+    "托帕_ZH": 665, "驭空_ZH": 666, "浮烟_ZH": 667, "停云_ZH": 668,
+    "镜流_ZH": 669, "罗刹_ZH": 670, "卢卡_ZH": 671, "史瓦罗_ZH": 672,
+    "黑塔_ZH": 673, "桑博_ZH": 674, "伦纳德_ZH": 675, "明曦_ZH": 676,
+    "银狼_ZH": 677, "帕姆_ZH": 678, "青雀_ZH": 679, "乔瓦尼_ZH": 680,
+    "公输师傅_ZH": 681, "晴霓_ZH": 682, "螺丝咕姆_ZH": 683, "阿兰_ZH": 684,
+    "奥列格_ZH": 685, "丹枢_ZH": 686, "尾巴_ZH": 687, "寒鸦_ZH": 688,
+    "雪衣_ZH": 689, "可可利亚_ZH": 690, "青镞_ZH": 691, "半夏_ZH": 692,
+    "银枝_ZH": 693, "大毫_ZH": 694, "霄翰_ZH": 695, "信使_ZH": 696,
+    "费斯曼_ZH": 697, "绿芙蓉_ZH": 698, "dev_成男_ZH": 699, "金人会长_ZH": 700,
+    "维利特_ZH": 701, "维尔德_ZH": 702, "斯科特_ZH": 703, "卡波特_ZH": 704,
+    "刃_ZH": 705, "岩明_ZH": 706, "浣溪_ZH": 707,
+    "三月七_JP": 708, "丹恒_JP": 709, "希儿_JP": 710, "娜塔莎_JP": 711,
+    "希露瓦_JP": 712, "瓦尔特_JP": 713, "佩拉_JP": 714, "布洛妮娅_JP": 715,
+    "虎克_JP": 716, "素裳_JP": 717, "克拉拉_JP": 718, "符玄_JP": 719,
+    "白露_JP": 720, "杰帕德_JP": 721, "景元_JP": 722, "藿藿_JP": 723,
+    "姬子_JP": 724, "卡芙卡_JP": 725, "穹_JP": 726, "星_JP": 727,
+    "桂乃芬_JP": 728, "艾丝妲_JP": 729, "彦卿_JP": 730, "玲可_JP": 731,
+    "托帕_JP": 732, "驭空_JP": 733, "浮烟_JP": 734, "停云_JP": 735,
+    "镜流_JP": 736, "罗刹_JP": 737, "卢卡_JP": 738, "史瓦罗_JP": 739,
+    "黑塔_JP": 740, "桑博_JP": 741, "伦纳德_JP": 742, "明曦_JP": 743,
+    "银狼_JP": 744, "帕姆_JP": 745, "青雀_JP": 746, "乔瓦尼_JP": 747,
+    "公输师傅_JP": 748, "晴霓_JP": 749, "螺丝咕姆_JP": 750, "阿兰_JP": 751,
+    "奥列格_JP": 752, "丹枢_JP": 753, "尾巴_JP": 754, "寒鸦_JP": 755,
+    "雪衣_JP": 756, "可可利亚_JP": 757, "青镞_JP": 758, "半夏_JP": 759,
+    "银枝_JP": 760, "大毫_JP": 761, "霄翰_JP": 762, "信使_JP": 763,
+    "费斯曼_JP": 764, "绿芙蓉_JP": 765, "dev_成男_JP": 766, "金人会长_JP": 767,
+    "维利特_JP": 768, "维尔德_JP": 769, "斯科特_JP": 770, "刃_JP": 771,
+    "卡波特_JP": 772, "岩明_JP": 773, "浣溪_JP": 774, "净砚_JP": 775,
+    "紫月季_JP": 776, "歌蒂_JP": 777, "奇怪的云骑_JP": 778, "幻胧_JP": 779,
+    "斯薇塔_JP": 780, "隐书_JP": 781,
+    "三月七_EN": 782, "丹恒_EN": 783, "希儿_EN": 784, "娜塔莎_EN": 785,
+    "希露瓦_EN": 786, "瓦尔特_EN": 787, "佩拉_EN": 788, "布洛妮娅_EN": 789,
+    "虎克_EN": 790, "素裳_EN": 791, "克拉拉_EN": 792, "符玄_EN": 793,
+    "白露_EN": 794, "杰帕德_EN": 795, "景元_EN": 796, "藿藿_EN": 797,
+    "姬子_EN": 798, "卡芙卡_EN": 799, "穹_EN": 800, "星_EN": 801,
+    "桂乃芬_EN": 802, "艾丝妲_EN": 803, "彦卿_EN": 804, "玲可_EN": 805,
+    "托帕_EN": 806, "驭空_EN": 807, "浮烟_EN": 808, "停云_EN": 809,
+    "镜流_EN": 810, "罗刹_EN": 811, "卢卡_EN": 812, "史瓦罗_EN": 813,
+    "黑塔_EN": 814, "桑博_EN": 815, "伦纳德_EN": 816, "明曦_EN": 817,
+    "银狼_EN": 818, "帕姆_EN": 819, "青雀_EN": 820, "乔瓦尼_EN": 821,
+    "公输师傅_EN": 822, "晴霓_EN": 823, "螺丝咕姆_EN": 824, "阿兰_EN": 825,
+    "奥列格_EN": 826, "丹枢_EN": 827, "尾巴_EN": 828, "寒鸦_EN": 829,
+    "雪衣_EN": 830, "可可利亚_EN": 831, "青镞_EN": 832, "半夏_EN": 833,
+    "银枝_EN": 834, "大毫_EN": 835, "霄翰_EN": 836, "信使_EN": 837,
+    "费斯曼_EN": 838, "绿芙蓉_EN": 839, "dev_成男_EN": 840, "金人会长_EN": 841,
+    "维利特_EN": 842, "维尔德_EN": 843, "刃_EN": 844, "卡波特_EN": 845,
+    "岩明_EN": 846, "浣溪_EN": 847, "紫月季_EN": 848, "幻胧_EN": 849,
+    "女声_EN": 850, "陆景和": 851, "莫弈": 852, "左然": 853,
+    "夏彦": 854
     }
   },
   "model": {
@@ -946,5 +949,5 @@
     "use_spectral_norm": false,
     "gin_channels": 256
   },
-  "version": "2.
+  "version": "2.2"
 }
css/custom.css
ADDED
@@ -0,0 +1,18 @@

#yml_code {
  height: 600px;
  flex-grow: inherit;
  overflow-y: auto;
}

#json_code {
  height: 600px;
  flex-grow: inherit;
  overflow-y: auto;
}

#gpu_code {
  height: 300px;
  flex-grow: inherit;
  overflow-y: auto;
}
data_utils.py
CHANGED
@@ -44,6 +44,10 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         self.min_text_len = getattr(hparams, "min_text_len", 1)
         self.max_text_len = getattr(hparams, "max_text_len", 384)
 
+        self.empty_emo = torch.squeeze(
+            torch.load("empty_emo.npy", map_location="cpu"), dim=1
+        )
+
         random.seed(1234)
         random.shuffle(self.audiopaths_sid_text)
         self._filter()
@@ -93,7 +97,14 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
 
         spec, wav = self.get_audio(audiopath)
         sid = torch.LongTensor([int(self.spk_map[sid])])
-
+
+        if np.random.rand() > 0.1:
+            emo = torch.squeeze(
+                torch.load(audiopath.replace(".wav", ".emo.npy"), map_location="cpu"),
+                dim=1,
+            )
+        else:
+            emo = self.empty_emo
         return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert, emo)
 
     def get_audio(self, filename):
@@ -157,15 +168,15 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
 
         if language_str == "ZH":
             bert = bert_ori
-            ja_bert = torch.
-            en_bert = torch.
+            ja_bert = torch.rand(1024, len(phone))
+            en_bert = torch.rand(1024, len(phone))
         elif language_str == "JP":
-            bert = torch.
+            bert = torch.rand(1024, len(phone))
             ja_bert = bert_ori
-            en_bert = torch.
+            en_bert = torch.rand(1024, len(phone))
         elif language_str == "EN":
-            bert = torch.
-            ja_bert = torch.
+            bert = torch.rand(1024, len(phone))
+            ja_bert = torch.rand(1024, len(phone))
             en_bert = bert_ori
         phone = torch.LongTensor(phone)
         tone = torch.LongTensor(tone)
@@ -215,7 +226,7 @@ class TextAudioSpeakerCollate:
         bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
         ja_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
         en_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
-        emo = torch.FloatTensor(len(batch),
+        emo = torch.FloatTensor(len(batch), 512)
 
         spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
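
The loader change above pairs each utterance with its cached CLAP embedding but, roughly 10% of the time, substitutes the neutral `empty_emo.npy` embedding, presumably so the model does not become fully dependent on the emotion prompt. A standalone sketch of that substitution rule (the project code does this inline in `__getitem__`):

```python
import numpy as np
import torch


def load_emo(audiopath: str, empty_emo: torch.Tensor, drop_prob: float = 0.1) -> torch.Tensor:
    # With probability drop_prob, fall back to the pre-computed "empty" embedding;
    # otherwise load the per-utterance CLAP feature written by clap_gen.py.
    if np.random.rand() > drop_prob:
        return torch.squeeze(
            torch.load(audiopath.replace(".wav", ".emo.npy"), map_location="cpu"), dim=1
        )
    return empty_emo
```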
default_config.yml
CHANGED
@@ -35,10 +35,10 @@ preprocess_text:
   val_path: "filelists/val.list"
   # 配置文件路径
   config_path: "config.json"
-  #
-
+  # 每个语言的验证集条数
+  val_per_lang: 4
   # 验证集最大条数,多于的会被截断并放到训练集中
-  max_val_total:
+  max_val_total: 12
   # 是否进行数据清洗
   clean: true
 
@@ -49,7 +49,7 @@ bert_gen:
   # 训练数据集配置文件路径
   config_path: "config.json"
   # 并行数
-  num_processes:
+  num_processes: 4
   # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
   # 该选项同时决定了get_bert_feature的默认设备
   device: "cuda"
@@ -62,9 +62,11 @@ emo_gen:
   # 训练数据集配置文件路径
   config_path: "config.json"
   # 并行数
-  num_processes:
+  num_processes: 4
   # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
   device: "cuda"
+  # 使用多卡推理
+  use_multi_device: false
 
 # train 训练配置
 # 注意, “:” 后需要加空格
@@ -81,11 +83,11 @@ train_ms:
   base:
     use_base_model: false
     repo_id: "Stardust_minus/Bert-VITS2"
-    model_image: "Bert-VITS2_2.
+    model_image: "Bert-VITS2_2.2-Clap底模"  # openi网页的模型名
   # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
   model: "models"
   # 配置文件路径
-  config_path: "
+  config_path: "config.json"
   # 训练使用的worker,不建议超过CPU核心数
   num_workers: 16
   # 关闭此项可以节约接近50%的磁盘空间,但是可能导致实际训练速度变慢和更高的CPU使用率。
@@ -100,9 +102,9 @@ webui:
   # 推理设备
   device: "cuda"
   # 模型路径
-  model: "
+  model: "models/G_8000.pth"
   # 配置文件路径
-  config_path: "
+  config_path: "config.json"
   # 端口号
   port: 7860
   # 是否公开部署,对外网开放
@@ -113,7 +115,7 @@ webui:
   language_identification_library: "langid"
 
 
-# server
+# server-fastapi配置
 # 注意, “:” 后需要加空格
 # 注意,本配置下的所有配置均为相对于根目录的路径
 server:
@@ -121,8 +123,10 @@ server:
   port: 5000
   # 模型默认使用设备:但是当前并没有实现这个配置。
   device: "cuda"
-  #
+  # 需要加载的所有模型的配置,可以填多个模型,也可以不填模型,等网页成功后手动加载模型
+  # 不加载模型的配置格式:删除默认给的两个模型配置,给models赋值 [ ],也就是空列表。参考模型2的speakers 即 models: [ ]
   # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。
+  # 也可以不填模型,等网页加载成功后手动填写models。
   models:
     - # 模型的路径
       model: ""
@@ -163,7 +167,6 @@ server:
      # 不必填写所有人物,不填的使用默认值
      speakers: [ ]  # 也可以不填
 
-
 # 百度翻译开放平台 api配置
 # api接入文档 https://api.fanyi.baidu.com/doc/21
 # 请不要在github等网站公开分享你的app id 与 key
emotional/clap-htsat-fused/.gitattributes
ADDED
@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
emotional/clap-htsat-fused/README.md
ADDED
@@ -0,0 +1,107 @@
---
license: apache-2.0
---
# Model card for CLAP

Model card for CLAP: Contrastive Language-Audio Pretraining

![clap_image](https://s3.amazonaws.com/moonup/production/uploads/1678811100805-62441d1d9fdefb55a0b7d12c.png)


# Table of Contents

0. [TL;DR](#TL;DR)
1. [Model Details](#model-details)
2. [Usage](#usage)
3. [Uses](#uses)
4. [Citation](#citation)

# TL;DR

The abstract of the paper states that:

> Contrastive learning has shown remarkable success in the field of multimodal representation learning. In this paper, we propose a pipeline of contrastive language-audio pretraining to develop an audio representation by combining audio data with natural language descriptions. To accomplish this target, we first release LAION-Audio-630K, a large collection of 633,526 audio-text pairs from different data sources. Second, we construct a contrastive language-audio pretraining model by considering different audio encoders and text encoders. We incorporate the feature fusion mechanism and keyword-to-caption augmentation into the model design to further enable the model to process audio inputs of variable lengths and enhance the performance. Third, we perform comprehensive experiments to evaluate our model across three tasks: text-to-audio retrieval, zero-shot audio classification, and supervised audio classification. The results demonstrate that our model achieves superior performance in text-to-audio retrieval task. In audio classification tasks, the model achieves state-of-the-art performance in the zero-shot setting and is able to obtain performance comparable to models' results in the non-zero-shot setting. LAION-Audio-630K and the proposed model are both available to the public.


# Usage

You can use this model for zero-shot audio classification or for extracting audio and/or textual features.

# Uses

## Perform zero-shot audio classification

### Using `pipeline`

```python
from datasets import load_dataset
from transformers import pipeline

dataset = load_dataset("ashraq/esc50")
audio = dataset["train"]["audio"][-1]["array"]

audio_classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-fused")
output = audio_classifier(audio, candidate_labels=["Sound of a dog", "Sound of vacuum cleaner"])
print(output)
>>> [{"score": 0.999, "label": "Sound of a dog"}, {"score": 0.001, "label": "Sound of vacuum cleaner"}]
```

## Run the model:

You can also get the audio and text embeddings using `ClapModel`.

### Run the model on CPU:

```python
from datasets import load_dataset
from transformers import ClapModel, ClapProcessor

librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
audio_sample = librispeech_dummy[0]

model = ClapModel.from_pretrained("laion/clap-htsat-fused")
processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

inputs = processor(audios=audio_sample["audio"]["array"], return_tensors="pt")
audio_embed = model.get_audio_features(**inputs)
```

### Run the model on GPU:

```python
from datasets import load_dataset
from transformers import ClapModel, ClapProcessor

librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
audio_sample = librispeech_dummy[0]

model = ClapModel.from_pretrained("laion/clap-htsat-fused").to(0)
processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

inputs = processor(audios=audio_sample["audio"]["array"], return_tensors="pt").to(0)
audio_embed = model.get_audio_features(**inputs)
```


# Citation

If you are using this model for your work, please consider citing the original paper:
```
@misc{https://doi.org/10.48550/arxiv.2211.06687,
  doi = {10.48550/ARXIV.2211.06687},
  url = {https://arxiv.org/abs/2211.06687},
  author = {Wu, Yusong and Chen, Ke and Zhang, Tianyu and Hui, Yuchen and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
  keywords = {Sound (cs.SD), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering},
  title = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
  publisher = {arXiv},
  year = {2022},
  copyright = {Creative Commons Attribution 4.0 International}
}
```
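The model card above focuses on audio embeddings, but for this repo the text branch matters just as much: an emotion prompt is embedded the same way a caption is. The following is a minimal sketch, assuming the checkpoint has been pulled into `./emotional/clap-htsat-fused` as in this commit:

```python
# Sketch: get 512-d CLAP text embeddings for emotion prompts.
# Assumes the local folder added in this commit contains the full checkpoint.
import torch
from transformers import ClapModel, ClapProcessor

local_dir = "./emotional/clap-htsat-fused"
model = ClapModel.from_pretrained(local_dir)
processor = ClapProcessor.from_pretrained(local_dir)

texts = ["a happy, excited voice", "a calm, sad voice"]
inputs = processor(text=texts, return_tensors="pt", padding=True)
with torch.no_grad():
    text_embed = model.get_text_features(**inputs)
print(text_embed.shape)  # expected torch.Size([2, 512]) given projection_dim = 512
```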
emotional/clap-htsat-fused/config.json
ADDED
@@ -0,0 +1,207 @@
1 |
+
{
|
2 |
+
"_commit_hash": null,
|
3 |
+
"architectures": [
|
4 |
+
"ClapModel"
|
5 |
+
],
|
6 |
+
"audio_config": {
|
7 |
+
"_name_or_path": "",
|
8 |
+
"add_cross_attention": false,
|
9 |
+
"aff_block_r": 4,
|
10 |
+
"architectures": null,
|
11 |
+
"attention_probs_dropout_prob": 0.0,
|
12 |
+
"bad_words_ids": null,
|
13 |
+
"begin_suppress_tokens": null,
|
14 |
+
"bos_token_id": null,
|
15 |
+
"chunk_size_feed_forward": 0,
|
16 |
+
"cross_attention_hidden_size": null,
|
17 |
+
"decoder_start_token_id": null,
|
18 |
+
"depths": [
|
19 |
+
2,
|
20 |
+
2,
|
21 |
+
6,
|
22 |
+
2
|
23 |
+
],
|
24 |
+
"diversity_penalty": 0.0,
|
25 |
+
"do_sample": false,
|
26 |
+
"drop_path_rate": 0.0,
|
27 |
+
"early_stopping": false,
|
28 |
+
"enable_fusion": true,
|
29 |
+
"enable_patch_fusion": true,
|
30 |
+
"enable_patch_layer_norm": true,
|
31 |
+
"encoder_no_repeat_ngram_size": 0,
|
32 |
+
"eos_token_id": null,
|
33 |
+
"exponential_decay_length_penalty": null,
|
34 |
+
"finetuning_task": null,
|
35 |
+
"flatten_patch_embeds": true,
|
36 |
+
"forced_bos_token_id": null,
|
37 |
+
"forced_eos_token_id": null,
|
38 |
+
"fusion_num_hidden_layers": 2,
|
39 |
+
"fusion_type": null,
|
40 |
+
"hidden_act": "gelu",
|
41 |
+
"hidden_dropout_prob": 0.1,
|
42 |
+
"hidden_size": 768,
|
43 |
+
"id2label": {
|
44 |
+
"0": "LABEL_0",
|
45 |
+
"1": "LABEL_1"
|
46 |
+
},
|
47 |
+
"initializer_factor": 1.0,
|
48 |
+
"is_decoder": false,
|
49 |
+
"is_encoder_decoder": false,
|
50 |
+
"label2id": {
|
51 |
+
"LABEL_0": 0,
|
52 |
+
"LABEL_1": 1
|
53 |
+
},
|
54 |
+
"layer_norm_eps": 1e-05,
|
55 |
+
"length_penalty": 1.0,
|
56 |
+
"max_length": 20,
|
57 |
+
"min_length": 0,
|
58 |
+
"mlp_ratio": 4.0,
|
59 |
+
"model_type": "clap_audio_model",
|
60 |
+
"no_repeat_ngram_size": 0,
|
61 |
+
"num_attention_heads": [
|
62 |
+
4,
|
63 |
+
8,
|
64 |
+
16,
|
65 |
+
32
|
66 |
+
],
|
67 |
+
"num_beam_groups": 1,
|
68 |
+
"num_beams": 1,
|
69 |
+
"num_classes": 527,
|
70 |
+
"num_hidden_layers": 4,
|
71 |
+
"num_mel_bins": 64,
|
72 |
+
"num_return_sequences": 1,
|
73 |
+
"output_attentions": false,
|
74 |
+
"output_hidden_states": false,
|
75 |
+
"output_scores": false,
|
76 |
+
"pad_token_id": null,
|
77 |
+
"patch_embed_input_channels": 1,
|
78 |
+
"patch_embeds_hidden_size": 96,
|
79 |
+
"patch_size": 4,
|
80 |
+
"patch_stride": [
|
81 |
+
4,
|
82 |
+
4
|
83 |
+
],
|
84 |
+
"prefix": null,
|
85 |
+
"problem_type": null,
|
86 |
+
"projection_dim": 512,
|
87 |
+
"projection_hidden_act": "relu",
|
88 |
+
"projection_hidden_size": 768,
|
89 |
+
"pruned_heads": {},
|
90 |
+
"qkv_bias": true,
|
91 |
+
"remove_invalid_values": false,
|
92 |
+
"repetition_penalty": 1.0,
|
93 |
+
"return_dict": true,
|
94 |
+
"return_dict_in_generate": false,
|
95 |
+
"sep_token_id": null,
|
96 |
+
"spec_size": 256,
|
97 |
+
"suppress_tokens": null,
|
98 |
+
"task_specific_params": null,
|
99 |
+
"temperature": 1.0,
|
100 |
+
"tf_legacy_loss": false,
|
101 |
+
"tie_encoder_decoder": false,
|
102 |
+
"tie_word_embeddings": true,
|
103 |
+
"tokenizer_class": null,
|
104 |
+
"top_k": 50,
|
105 |
+
"top_p": 1.0,
|
106 |
+
"torch_dtype": null,
|
107 |
+
"torchscript": false,
|
108 |
+
"transformers_version": "4.27.0.dev0",
|
109 |
+
"typical_p": 1.0,
|
110 |
+
"use_bfloat16": false,
|
111 |
+
"window_size": 8
|
112 |
+
},
|
113 |
+
"hidden_size": 768,
|
114 |
+
"initializer_factor": 1.0,
|
115 |
+
"logit_scale_init_value": 14.285714285714285,
|
116 |
+
"model_type": "clap",
|
117 |
+
"num_hidden_layers": 16,
|
118 |
+
"projection_dim": 512,
|
119 |
+
"projection_hidden_act": "relu",
|
120 |
+
"text_config": {
|
121 |
+
"_name_or_path": "",
|
122 |
+
"add_cross_attention": false,
|
123 |
+
"architectures": null,
|
124 |
+
"attention_probs_dropout_prob": 0.1,
|
125 |
+
"bad_words_ids": null,
|
126 |
+
"begin_suppress_tokens": null,
|
127 |
+
"bos_token_id": 0,
|
128 |
+
"chunk_size_feed_forward": 0,
|
129 |
+
"classifier_dropout": null,
|
130 |
+
"cross_attention_hidden_size": null,
|
131 |
+
"decoder_start_token_id": null,
|
132 |
+
"diversity_penalty": 0.0,
|
133 |
+
"do_sample": false,
|
134 |
+
"early_stopping": false,
|
135 |
+
"encoder_no_repeat_ngram_size": 0,
|
136 |
+
"eos_token_id": 2,
|
137 |
+
"exponential_decay_length_penalty": null,
|
138 |
+
"finetuning_task": null,
|
139 |
+
"forced_bos_token_id": null,
|
140 |
+
"forced_eos_token_id": null,
|
141 |
+
"fusion_hidden_size": 768,
|
142 |
+
"fusion_num_hidden_layers": 2,
|
143 |
+
"hidden_act": "gelu",
|
144 |
+
"hidden_dropout_prob": 0.1,
|
145 |
+
"hidden_size": 768,
|
146 |
+
"id2label": {
|
147 |
+
"0": "LABEL_0",
|
148 |
+
"1": "LABEL_1"
|
149 |
+
},
|
150 |
+
"initializer_factor": 1.0,
|
151 |
+
"initializer_range": 0.02,
|
152 |
+
"intermediate_size": 3072,
|
153 |
+
"is_decoder": false,
|
154 |
+
"is_encoder_decoder": false,
|
155 |
+
"label2id": {
|
156 |
+
"LABEL_0": 0,
|
157 |
+
"LABEL_1": 1
|
158 |
+
},
|
159 |
+
"layer_norm_eps": 1e-12,
|
160 |
+
"length_penalty": 1.0,
|
161 |
+
"max_length": 20,
|
162 |
+
"max_position_embeddings": 514,
|
163 |
+
"min_length": 0,
|
164 |
+
"model_type": "clap_text_model",
|
165 |
+
"no_repeat_ngram_size": 0,
|
166 |
+
"num_attention_heads": 12,
|
167 |
+
"num_beam_groups": 1,
|
168 |
+
"num_beams": 1,
|
169 |
+
"num_hidden_layers": 12,
|
170 |
+
"num_return_sequences": 1,
|
171 |
+
"output_attentions": false,
|
172 |
+
"output_hidden_states": false,
|
173 |
+
"output_scores": false,
|
174 |
+
"pad_token_id": 1,
|
175 |
+
"position_embedding_type": "absolute",
|
176 |
+
"prefix": null,
|
177 |
+
"problem_type": null,
|
178 |
+
"projection_dim": 512,
|
179 |
+
"projection_hidden_act": "relu",
|
180 |
+
"projection_hidden_size": 768,
|
181 |
+
"pruned_heads": {},
|
182 |
+
"remove_invalid_values": false,
|
183 |
+
"repetition_penalty": 1.0,
|
184 |
+
"return_dict": true,
|
185 |
+
"return_dict_in_generate": false,
|
186 |
+
"sep_token_id": null,
|
187 |
+
"suppress_tokens": null,
|
188 |
+
"task_specific_params": null,
|
189 |
+
"temperature": 1.0,
|
190 |
+
"tf_legacy_loss": false,
|
191 |
+
"tie_encoder_decoder": false,
|
192 |
+
"tie_word_embeddings": true,
|
193 |
+
"tokenizer_class": null,
|
194 |
+
"top_k": 50,
|
195 |
+
"top_p": 1.0,
|
196 |
+
"torch_dtype": null,
|
197 |
+
"torchscript": false,
|
198 |
+
"transformers_version": "4.27.0.dev0",
|
199 |
+
"type_vocab_size": 1,
|
200 |
+
"typical_p": 1.0,
|
201 |
+
"use_bfloat16": false,
|
202 |
+
"use_cache": true,
|
203 |
+
"vocab_size": 50265
|
204 |
+
},
|
205 |
+
"torch_dtype": "float32",
|
206 |
+
"transformers_version": null
|
207 |
+
}
|
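The `projection_dim` of 512 in this config is the number the rest of the repo relies on: the emotion features fed into the synthesizer are 512-dimensional, and the fused (variable-length) audio branch is enabled. A small sketch to confirm that programmatically, assuming the local folder path used in this commit:

```python
# Sketch: inspect the CLAP config that was just added and confirm the
# embedding width and variant the TTS code expects.
from transformers import ClapConfig

cfg = ClapConfig.from_pretrained("./emotional/clap-htsat-fused")
print(cfg.projection_dim)             # 512 -> width of the emotion features
print(cfg.audio_config.enable_fusion) # True -> the "fused" HTSAT variant
print(cfg.text_config.vocab_size)     # 50265 -> RoBERTa vocabulary
```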
emotional/clap-htsat-fused/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
emotional/clap-htsat-fused/preprocessor_config.json
ADDED
@@ -0,0 +1,22 @@
{
  "chunk_length_s": 10,
  "feature_extractor_type": "ClapFeatureExtractor",
  "feature_size": 64,
  "fft_window_size": 1024,
  "frequency_max": 14000,
  "frequency_min": 50,
  "hop_length": 480,
  "max_length_s": 10,
  "n_fft": 1024,
  "nb_frequency_bins": 513,
  "nb_max_frames": 1000,
  "nb_max_samples": 480000,
  "padding": "repeatpad",
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "ClapProcessor",
  "return_attention_mask": false,
  "sampling_rate": 48000,
  "top_db": null,
  "truncation": "fusion"
}
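These preprocessing values (48 kHz input, 10-second windows, 64 mel bins, "fusion" truncation) are what the processor applies before the audio tower sees anything. A quick sketch of what that looks like on a dummy clip, assuming the local folder path from this commit:

```python
# Sketch: run the CLAP feature extractor on a short dummy clip to see the
# effect of the preprocessor settings above.
import numpy as np
from transformers import ClapFeatureExtractor

fe = ClapFeatureExtractor.from_pretrained("./emotional/clap-htsat-fused")
print(fe.sampling_rate, fe.feature_size)  # 48000 64

wav = np.random.randn(48000 * 3).astype(np.float32)  # 3 s of noise at 48 kHz
features = fe(wav, sampling_rate=48000, return_tensors="pt")
print(features["input_features"].shape)  # mel-spectrogram batch fed to the audio encoder
```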
emotional/clap-htsat-fused/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1ed5d0215d887551ddd0a49ce7311b21429ebdf1e6a129d4e68f743357225253
size 614596545
emotional/clap-htsat-fused/special_tokens_map.json
ADDED
@@ -0,0 +1,15 @@
{
  "bos_token": "<s>",
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "unk_token": "<unk>"
}
emotional/clap-htsat-fused/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
emotional/clap-htsat-fused/tokenizer_config.json
ADDED
@@ -0,0 +1,16 @@
{
  "add_prefix_space": false,
  "bos_token": "<s>",
  "cls_token": "<s>",
  "eos_token": "</s>",
  "errors": "replace",
  "mask_token": "<mask>",
  "model_max_length": 512,
  "pad_token": "<pad>",
  "processor_class": "ClapProcessor",
  "sep_token": "</s>",
  "special_tokens_map_file": null,
  "tokenizer_class": "RobertaTokenizer",
  "trim_offsets": true,
  "unk_token": "<unk>"
}
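The text branch therefore uses a standard RoBERTa BPE tokenizer: prompts are wrapped in `<s> ... </s>` and limited to 512 tokens. A tiny sketch of what it produces for an emotion prompt (same local path as above; the printed token list is approximate):

```python
# Sketch: tokenize an emotion prompt with the RoBERTa tokenizer added here.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./emotional/clap-htsat-fused")
enc = tok("a happy, excited voice")
print(tok.convert_ids_to_tokens(enc["input_ids"]))
# ['<s>', 'a', 'Ġhappy', ',', 'Ġexcited', 'Ġvoice', '</s>']  (approximately)
print(tok.model_max_length)  # 512
```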
emotional/clap-htsat-fused/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
empty_emo.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:07063411ab7d6e7aacfc73c582616c3fbc8fdf518b20d42d8be77bc9caf6fab9
size 3238
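`empty_emo.npy` appears to be a pre-computed "neutral" CLAP feature used when neither reference audio nor an emotion prompt is supplied; the script that produces it is not shown in this diff. The following is only a sketch of how such a fallback might be loaded (its exact shape is an assumption, so only the shape is printed rather than asserted):

```python
# Sketch (assumption: empty_emo.npy stores a neutral CLAP-style feature vector
# used as a fallback; how it was generated is not part of this diff).
import numpy as np
import torch

emo = torch.from_numpy(np.load("empty_emo.npy")).float()
print(emo.shape)  # should be compatible with the 512-d CLAP features used elsewhere
```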
export_onnx.py
CHANGED
@@ -1,54 +1,10 @@
-from
-import utils
-from text.symbols import symbols
+from onnx_modules import export_onnx
 import os
-import json
-
-
-def export_onnx(export_path, model_path, config_path):
-    hps = utils.get_hparams_from_file(config_path)
-    net_g = SynthesizerTrn(
-        len(symbols),
-        hps.data.filter_length // 2 + 1,
-        hps.train.segment_size // hps.data.hop_length,
-        n_speakers=hps.data.n_speakers,
-        **hps.model,
-    )
-    _ = net_g.eval()
-    _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
-    net_g.export_onnx(export_path)
-
-    spklist = []
-    for key in hps.data.spk2id.keys():
-        spklist.append(key)
-
-    MoeVSConf = {
-        "Folder": f"{export_path}",
-        "Name": f"{export_path}",
-        "Type": "BertVits",
-        "Symbol": symbols,
-        "Cleaner": "",
-        "Rate": hps.data.sampling_rate,
-        "CharaMix": True,
-        "Characters": spklist,
-        "LanguageMap": {"ZH": [0, 0], "JP": [1, 6], "EN": [2, 8]},
-        "Dict": "BasicDict",
-        "BertPath": [
-            "chinese-roberta-wwm-ext-large",
-            "deberta-v2-large-japanese",
-            "bert-base-japanese-v3",
-        ],
-    }
-
-    with open(f"onnx/{export_path}.json", "w") as MoeVsConfFile:
-        json.dump(MoeVSConf, MoeVsConfFile, indent=4)
-

 if __name__ == "__main__":
-
-
-
-    config_path = "config.json"
+    export_path = "BertVits2.2PT"
+    model_path = "model\\G_0.pth"
+    config_path = "model\\config.json"
     if not os.path.exists("onnx"):
         os.makedirs("onnx")
     if not os.path.exists(f"onnx/{export_path}"):
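The rewritten script now delegates to `onnx_modules.export_onnx` and only sets up the paths and the output directory; the tail of its `__main__` block is not shown in this diff. As a rough usage sketch only, where the final `export_onnx(...)` call is an assumption based on the old inline function's signature, not something visible above:

```python
# Rough sketch of driving the new export entry point. The export_onnx(...) call
# below is assumed from the old inline signature (export_path, model_path,
# config_path); the actual remainder of __main__ is not shown in this diff.
import os
from onnx_modules import export_onnx

export_path = "BertVits2.2PT"
model_path = "model\\G_0.pth"
config_path = "model\\config.json"
os.makedirs(f"onnx/{export_path}", exist_ok=True)
export_onnx(export_path, model_path, config_path)  # hypothetical call
```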
img/yuyu.png
ADDED
img//345/217/202/346/225/260/350/257/264/346/230/216.png
ADDED
img//345/256/265/345/256/253.png
ADDED
img//345/276/256/344/277/241/345/233/276/347/211/207_20231010105112.png
ADDED
img//347/245/236/351/207/214/347/273/253/345/215/216.png
ADDED
img//347/272/263/350/245/277/345/246/262.png
ADDED
infer.py
ADDED
@@ -0,0 +1,381 @@
1 |
+
"""
|
2 |
+
版本管理、兼容推理及模型加载实现。
|
3 |
+
版本说明:
|
4 |
+
1. 版本号与github的release版本号对应,使用哪个release版本训练的模型即对应其版本号
|
5 |
+
2. 请在模型的config.json中显示声明版本号,添加一个字段"version" : "你的版本号"
|
6 |
+
特殊版本说明:
|
7 |
+
1.1.1-fix: 1.1.1版本训练的模型,但是在推理时使用dev的日语修复
|
8 |
+
2.2:当前版本
|
9 |
+
"""
|
10 |
+
import torch
|
11 |
+
import commons
|
12 |
+
from text import cleaned_text_to_sequence, get_bert
|
13 |
+
from clap_wrapper import get_clap_audio_feature, get_clap_text_feature
|
14 |
+
from text.cleaner import clean_text
|
15 |
+
import utils
|
16 |
+
import numpy as np
|
17 |
+
|
18 |
+
from models import SynthesizerTrn
|
19 |
+
from text.symbols import symbols
|
20 |
+
|
21 |
+
from oldVersion.V210.models import SynthesizerTrn as V210SynthesizerTrn
|
22 |
+
from oldVersion.V210.text import symbols as V210symbols
|
23 |
+
from oldVersion.V200.models import SynthesizerTrn as V200SynthesizerTrn
|
24 |
+
from oldVersion.V200.text import symbols as V200symbols
|
25 |
+
from oldVersion.V111.models import SynthesizerTrn as V111SynthesizerTrn
|
26 |
+
from oldVersion.V111.text import symbols as V111symbols
|
27 |
+
from oldVersion.V110.models import SynthesizerTrn as V110SynthesizerTrn
|
28 |
+
from oldVersion.V110.text import symbols as V110symbols
|
29 |
+
from oldVersion.V101.models import SynthesizerTrn as V101SynthesizerTrn
|
30 |
+
from oldVersion.V101.text import symbols as V101symbols
|
31 |
+
|
32 |
+
from oldVersion import V111, V110, V101, V200, V210
|
33 |
+
|
34 |
+
# 当前版本信息
|
35 |
+
latest_version = "2.2"
|
36 |
+
|
37 |
+
# 版本兼容
|
38 |
+
SynthesizerTrnMap = {
|
39 |
+
"2.1": V210SynthesizerTrn,
|
40 |
+
"2.0.2-fix": V200SynthesizerTrn,
|
41 |
+
"2.0.1": V200SynthesizerTrn,
|
42 |
+
"2.0": V200SynthesizerTrn,
|
43 |
+
"1.1.1-fix": V111SynthesizerTrn,
|
44 |
+
"1.1.1": V111SynthesizerTrn,
|
45 |
+
"1.1": V110SynthesizerTrn,
|
46 |
+
"1.1.0": V110SynthesizerTrn,
|
47 |
+
"1.0.1": V101SynthesizerTrn,
|
48 |
+
"1.0": V101SynthesizerTrn,
|
49 |
+
"1.0.0": V101SynthesizerTrn,
|
50 |
+
}
|
51 |
+
|
52 |
+
symbolsMap = {
|
53 |
+
"2.1": V210symbols,
|
54 |
+
"2.0.2-fix": V200symbols,
|
55 |
+
"2.0.1": V200symbols,
|
56 |
+
"2.0": V200symbols,
|
57 |
+
"1.1.1-fix": V111symbols,
|
58 |
+
"1.1.1": V111symbols,
|
59 |
+
"1.1": V110symbols,
|
60 |
+
"1.1.0": V110symbols,
|
61 |
+
"1.0.1": V101symbols,
|
62 |
+
"1.0": V101symbols,
|
63 |
+
"1.0.0": V101symbols,
|
64 |
+
}
|
65 |
+
|
66 |
+
|
67 |
+
# def get_emo_(reference_audio, emotion, sid):
|
68 |
+
# emo = (
|
69 |
+
# torch.from_numpy(get_emo(reference_audio))
|
70 |
+
# if reference_audio and emotion == -1
|
71 |
+
# else torch.FloatTensor(
|
72 |
+
# np.load(f"emo_clustering/{sid}/cluster_center_{emotion}.npy")
|
73 |
+
# )
|
74 |
+
# )
|
75 |
+
# return emo
|
76 |
+
|
77 |
+
|
78 |
+
def get_net_g(model_path: str, version: str, device: str, hps):
|
79 |
+
if version != latest_version:
|
80 |
+
net_g = SynthesizerTrnMap[version](
|
81 |
+
len(symbolsMap[version]),
|
82 |
+
hps.data.filter_length // 2 + 1,
|
83 |
+
hps.train.segment_size // hps.data.hop_length,
|
84 |
+
n_speakers=hps.data.n_speakers,
|
85 |
+
**hps.model,
|
86 |
+
).to(device)
|
87 |
+
else:
|
88 |
+
# 当前版本模型 net_g
|
89 |
+
net_g = SynthesizerTrn(
|
90 |
+
len(symbols),
|
91 |
+
hps.data.filter_length // 2 + 1,
|
92 |
+
hps.train.segment_size // hps.data.hop_length,
|
93 |
+
n_speakers=hps.data.n_speakers,
|
94 |
+
**hps.model,
|
95 |
+
).to(device)
|
96 |
+
_ = net_g.eval()
|
97 |
+
_ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
|
98 |
+
return net_g
|
99 |
+
|
100 |
+
|
101 |
+
def get_text(text, language_str, hps, device):
|
102 |
+
# 在此处实现当前版本的get_text
|
103 |
+
norm_text, phone, tone, word2ph = clean_text(text, language_str)
|
104 |
+
phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
|
105 |
+
|
106 |
+
if hps.data.add_blank:
|
107 |
+
phone = commons.intersperse(phone, 0)
|
108 |
+
tone = commons.intersperse(tone, 0)
|
109 |
+
language = commons.intersperse(language, 0)
|
110 |
+
for i in range(len(word2ph)):
|
111 |
+
word2ph[i] = word2ph[i] * 2
|
112 |
+
word2ph[0] += 1
|
113 |
+
bert_ori = get_bert(norm_text, word2ph, language_str, device)
|
114 |
+
del word2ph
|
115 |
+
assert bert_ori.shape[-1] == len(phone), phone
|
116 |
+
|
117 |
+
if language_str == "ZH":
|
118 |
+
bert = bert_ori
|
119 |
+
ja_bert = torch.rand(1024, len(phone))
|
120 |
+
en_bert = torch.rand(1024, len(phone))
|
121 |
+
elif language_str == "JP":
|
122 |
+
bert = torch.rand(1024, len(phone))
|
123 |
+
ja_bert = bert_ori
|
124 |
+
en_bert = torch.rand(1024, len(phone))
|
125 |
+
elif language_str == "EN":
|
126 |
+
bert = torch.rand(1024, len(phone))
|
127 |
+
ja_bert = torch.rand(1024, len(phone))
|
128 |
+
en_bert = bert_ori
|
129 |
+
else:
|
130 |
+
raise ValueError("language_str should be ZH, JP or EN")
|
131 |
+
|
132 |
+
assert bert.shape[-1] == len(
|
133 |
+
phone
|
134 |
+
), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
|
135 |
+
|
136 |
+
phone = torch.LongTensor(phone)
|
137 |
+
tone = torch.LongTensor(tone)
|
138 |
+
language = torch.LongTensor(language)
|
139 |
+
return bert, ja_bert, en_bert, phone, tone, language
|
140 |
+
|
141 |
+
|
142 |
+
def infer(
|
143 |
+
text,
|
144 |
+
emotion,
|
145 |
+
sdp_ratio,
|
146 |
+
noise_scale,
|
147 |
+
noise_scale_w,
|
148 |
+
length_scale,
|
149 |
+
sid,
|
150 |
+
language,
|
151 |
+
hps,
|
152 |
+
net_g,
|
153 |
+
device,
|
154 |
+
reference_audio=None,
|
155 |
+
skip_start=False,
|
156 |
+
skip_end=False,
|
157 |
+
):
|
158 |
+
# 2.2版本参数位置变了
|
159 |
+
# 2.1 参数新增 emotion reference_audio skip_start skip_end
|
160 |
+
inferMap_V3 = {
|
161 |
+
"2.1": V210.infer,
|
162 |
+
}
|
163 |
+
# 支持中日英三语版本
|
164 |
+
inferMap_V2 = {
|
165 |
+
"2.0.2-fix": V200.infer,
|
166 |
+
"2.0.1": V200.infer,
|
167 |
+
"2.0": V200.infer,
|
168 |
+
"1.1.1-fix": V111.infer_fix,
|
169 |
+
"1.1.1": V111.infer,
|
170 |
+
"1.1": V110.infer,
|
171 |
+
"1.1.0": V110.infer,
|
172 |
+
}
|
173 |
+
# 仅支持中文版本
|
174 |
+
# 在测试中,并未发现两个版本的模型不能互相通用
|
175 |
+
inferMap_V1 = {
|
176 |
+
"1.0.1": V101.infer,
|
177 |
+
"1.0": V101.infer,
|
178 |
+
"1.0.0": V101.infer,
|
179 |
+
}
|
180 |
+
version = hps.version if hasattr(hps, "version") else latest_version
|
181 |
+
# 非当前版本,根据版本号选择合适的infer
|
182 |
+
if version != latest_version:
|
183 |
+
if version in inferMap_V3.keys():
|
184 |
+
return inferMap_V3[version](
|
185 |
+
text,
|
186 |
+
sdp_ratio,
|
187 |
+
noise_scale,
|
188 |
+
noise_scale_w,
|
189 |
+
length_scale,
|
190 |
+
sid,
|
191 |
+
language,
|
192 |
+
hps,
|
193 |
+
net_g,
|
194 |
+
device,
|
195 |
+
reference_audio,
|
196 |
+
emotion,
|
197 |
+
skip_start,
|
198 |
+
skip_end,
|
199 |
+
)
|
200 |
+
if version in inferMap_V2.keys():
|
201 |
+
return inferMap_V2[version](
|
202 |
+
text,
|
203 |
+
sdp_ratio,
|
204 |
+
noise_scale,
|
205 |
+
noise_scale_w,
|
206 |
+
length_scale,
|
207 |
+
sid,
|
208 |
+
language,
|
209 |
+
hps,
|
210 |
+
net_g,
|
211 |
+
device,
|
212 |
+
)
|
213 |
+
if version in inferMap_V1.keys():
|
214 |
+
return inferMap_V1[version](
|
215 |
+
text,
|
216 |
+
sdp_ratio,
|
217 |
+
noise_scale,
|
218 |
+
noise_scale_w,
|
219 |
+
length_scale,
|
220 |
+
sid,
|
221 |
+
hps,
|
222 |
+
net_g,
|
223 |
+
device,
|
224 |
+
)
|
225 |
+
# 在此处实现当前版本的推理
|
226 |
+
# emo = get_emo_(reference_audio, emotion, sid)
|
227 |
+
if isinstance(reference_audio, np.ndarray):
|
228 |
+
emo = get_clap_audio_feature(reference_audio, device)
|
229 |
+
else:
|
230 |
+
emo = get_clap_text_feature(emotion, device)
|
231 |
+
emo = torch.squeeze(emo, dim=1)
|
232 |
+
|
233 |
+
bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
|
234 |
+
text, language, hps, device
|
235 |
+
)
|
236 |
+
if skip_start:
|
237 |
+
phones = phones[3:]
|
238 |
+
tones = tones[3:]
|
239 |
+
lang_ids = lang_ids[3:]
|
240 |
+
bert = bert[:, 3:]
|
241 |
+
ja_bert = ja_bert[:, 3:]
|
242 |
+
en_bert = en_bert[:, 3:]
|
243 |
+
if skip_end:
|
244 |
+
phones = phones[:-2]
|
245 |
+
tones = tones[:-2]
|
246 |
+
lang_ids = lang_ids[:-2]
|
247 |
+
bert = bert[:, :-2]
|
248 |
+
ja_bert = ja_bert[:, :-2]
|
249 |
+
en_bert = en_bert[:, :-2]
|
250 |
+
with torch.no_grad():
|
251 |
+
x_tst = phones.to(device).unsqueeze(0)
|
252 |
+
tones = tones.to(device).unsqueeze(0)
|
253 |
+
lang_ids = lang_ids.to(device).unsqueeze(0)
|
254 |
+
bert = bert.to(device).unsqueeze(0)
|
255 |
+
ja_bert = ja_bert.to(device).unsqueeze(0)
|
256 |
+
en_bert = en_bert.to(device).unsqueeze(0)
|
257 |
+
x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
|
258 |
+
emo = emo.to(device).unsqueeze(0)
|
259 |
+
del phones
|
260 |
+
speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
|
261 |
+
audio = (
|
262 |
+
net_g.infer(
|
263 |
+
x_tst,
|
264 |
+
x_tst_lengths,
|
265 |
+
speakers,
|
266 |
+
tones,
|
267 |
+
lang_ids,
|
268 |
+
bert,
|
269 |
+
ja_bert,
|
270 |
+
en_bert,
|
271 |
+
emo,
|
272 |
+
sdp_ratio=sdp_ratio,
|
273 |
+
noise_scale=noise_scale,
|
274 |
+
noise_scale_w=noise_scale_w,
|
275 |
+
length_scale=length_scale,
|
276 |
+
)[0][0, 0]
|
277 |
+
.data.cpu()
|
278 |
+
.float()
|
279 |
+
.numpy()
|
280 |
+
)
|
281 |
+
del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo
|
282 |
+
if torch.cuda.is_available():
|
283 |
+
torch.cuda.empty_cache()
|
284 |
+
return audio
|
285 |
+
|
286 |
+
|
287 |
+
def infer_multilang(
|
288 |
+
text,
|
289 |
+
sdp_ratio,
|
290 |
+
noise_scale,
|
291 |
+
noise_scale_w,
|
292 |
+
length_scale,
|
293 |
+
sid,
|
294 |
+
language,
|
295 |
+
hps,
|
296 |
+
net_g,
|
297 |
+
device,
|
298 |
+
reference_audio=None,
|
299 |
+
emotion=None,
|
300 |
+
skip_start=False,
|
301 |
+
skip_end=False,
|
302 |
+
):
|
303 |
+
bert, ja_bert, en_bert, phones, tones, lang_ids = [], [], [], [], [], []
|
304 |
+
# emo = get_emo_(reference_audio, emotion, sid)
|
305 |
+
if isinstance(reference_audio, np.ndarray):
|
306 |
+
emo = get_clap_audio_feature(reference_audio, device)
|
307 |
+
else:
|
308 |
+
emo = get_clap_text_feature(emotion, device)
|
309 |
+
emo = torch.squeeze(emo, dim=1)
|
310 |
+
for idx, (txt, lang) in enumerate(zip(text, language)):
|
311 |
+
skip_start = (idx != 0) or (skip_start and idx == 0)
|
312 |
+
skip_end = (idx != len(text) - 1) or (skip_end and idx == len(text) - 1)
|
313 |
+
(
|
314 |
+
temp_bert,
|
315 |
+
temp_ja_bert,
|
316 |
+
temp_en_bert,
|
317 |
+
temp_phones,
|
318 |
+
temp_tones,
|
319 |
+
temp_lang_ids,
|
320 |
+
) = get_text(txt, lang, hps, device)
|
321 |
+
if skip_start:
|
322 |
+
temp_bert = temp_bert[:, 3:]
|
323 |
+
temp_ja_bert = temp_ja_bert[:, 3:]
|
324 |
+
temp_en_bert = temp_en_bert[:, 3:]
|
325 |
+
temp_phones = temp_phones[3:]
|
326 |
+
temp_tones = temp_tones[3:]
|
327 |
+
temp_lang_ids = temp_lang_ids[3:]
|
328 |
+
if skip_end:
|
329 |
+
temp_bert = temp_bert[:, :-2]
|
330 |
+
temp_ja_bert = temp_ja_bert[:, :-2]
|
331 |
+
temp_en_bert = temp_en_bert[:, :-2]
|
332 |
+
temp_phones = temp_phones[:-2]
|
333 |
+
temp_tones = temp_tones[:-2]
|
334 |
+
temp_lang_ids = temp_lang_ids[:-2]
|
335 |
+
bert.append(temp_bert)
|
336 |
+
ja_bert.append(temp_ja_bert)
|
337 |
+
en_bert.append(temp_en_bert)
|
338 |
+
phones.append(temp_phones)
|
339 |
+
tones.append(temp_tones)
|
340 |
+
lang_ids.append(temp_lang_ids)
|
341 |
+
bert = torch.concatenate(bert, dim=1)
|
342 |
+
ja_bert = torch.concatenate(ja_bert, dim=1)
|
343 |
+
en_bert = torch.concatenate(en_bert, dim=1)
|
344 |
+
phones = torch.concatenate(phones, dim=0)
|
345 |
+
tones = torch.concatenate(tones, dim=0)
|
346 |
+
lang_ids = torch.concatenate(lang_ids, dim=0)
|
347 |
+
with torch.no_grad():
|
348 |
+
x_tst = phones.to(device).unsqueeze(0)
|
349 |
+
tones = tones.to(device).unsqueeze(0)
|
350 |
+
lang_ids = lang_ids.to(device).unsqueeze(0)
|
351 |
+
bert = bert.to(device).unsqueeze(0)
|
352 |
+
ja_bert = ja_bert.to(device).unsqueeze(0)
|
353 |
+
en_bert = en_bert.to(device).unsqueeze(0)
|
354 |
+
emo = emo.to(device).unsqueeze(0)
|
355 |
+
x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
|
356 |
+
del phones
|
357 |
+
speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
|
358 |
+
audio = (
|
359 |
+
net_g.infer(
|
360 |
+
x_tst,
|
361 |
+
x_tst_lengths,
|
362 |
+
speakers,
|
363 |
+
tones,
|
364 |
+
lang_ids,
|
365 |
+
bert,
|
366 |
+
ja_bert,
|
367 |
+
en_bert,
|
368 |
+
emo,
|
369 |
+
sdp_ratio=sdp_ratio,
|
370 |
+
noise_scale=noise_scale,
|
371 |
+
noise_scale_w=noise_scale_w,
|
372 |
+
length_scale=length_scale,
|
373 |
+
)[0][0, 0]
|
374 |
+
.data.cpu()
|
375 |
+
.float()
|
376 |
+
.numpy()
|
377 |
+
)
|
378 |
+
del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo
|
379 |
+
if torch.cuda.is_available():
|
380 |
+
torch.cuda.empty_cache()
|
381 |
+
return audio
|
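Putting the new infer.py together: the caller loads the hparams, builds the right `SynthesizerTrn` for the checkpoint's declared version via `get_net_g`, and then calls `infer` with either a reference-audio array or an emotion text prompt (the CLAP branch picks one or the other). A minimal usage sketch; the dataset paths, speaker name, and sampler values below are placeholders, not taken from this diff:

```python
# Minimal usage sketch for infer.py (paths, speaker and sampler values are placeholders).
import utils
from infer import get_net_g, infer, latest_version

device = "cuda"
hps = utils.get_hparams_from_file("Data/MyDataset/config.json")      # hypothetical path
version = getattr(hps, "version", latest_version)
net_g = get_net_g("Data/MyDataset/models/G_8000.pth", version, device, hps)

audio = infer(
    text="こんにちは",
    emotion="a cheerful, energetic voice",  # embedded via CLAP text features when no reference audio is given
    sdp_ratio=0.2,
    noise_scale=0.6,
    noise_scale_w=0.8,
    length_scale=1.0,
    sid="MySpeaker",          # must be a key in hps.data.spk2id
    language="JP",
    hps=hps,
    net_g=net_g,
    device=device,
    reference_audio=None,
)
# `audio` is a float32 numpy array at hps.data.sampling_rate
```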
models.py
CHANGED
@@ -10,11 +10,12 @@ import monotonic_align
|
|
10 |
|
11 |
from torch.nn import Conv1d, ConvTranspose1d, Conv2d
|
12 |
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
13 |
-
from vector_quantize_pytorch import VectorQuantize
|
14 |
|
15 |
from commons import init_weights, get_padding
|
16 |
from text import symbols, num_tones, num_languages
|
17 |
|
|
|
|
|
18 |
|
19 |
class DurationDiscriminator(nn.Module): # vits2
|
20 |
def __init__(
|
@@ -311,6 +312,37 @@ class DurationPredictor(nn.Module):
|
|
311 |
return x * x_mask
|
312 |
|
313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
314 |
class TextEncoder(nn.Module):
|
315 |
def __init__(
|
316 |
self,
|
@@ -344,18 +376,31 @@ class TextEncoder(nn.Module):
|
|
344 |
self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
|
345 |
self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
|
346 |
self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
|
347 |
-
self.emo_proj = nn.Linear(
|
348 |
-
self.
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
)
|
357 |
-
|
358 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
359 |
|
360 |
self.encoder = attentions.Encoder(
|
361 |
hidden_channels,
|
@@ -375,26 +420,11 @@ class TextEncoder(nn.Module):
|
|
375 |
bert_emb = self.bert_proj(bert).transpose(1, 2)
|
376 |
ja_bert_emb = self.ja_bert_proj(ja_bert).transpose(1, 2)
|
377 |
en_bert_emb = self.en_bert_proj(en_bert).transpose(1, 2)
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
temp_emo_emb, _, temp_emo_commit_loss = self.emo_quantizer[sid[i]](
|
384 |
-
emo_emb[i].unsqueeze(0).cpu()
|
385 |
-
)
|
386 |
-
emo_commit_loss += temp_emo_commit_loss
|
387 |
-
emo_emb_.append(temp_emo_emb)
|
388 |
-
emo_emb = torch.cat(emo_emb_, dim=0).to(emo_emb.device)
|
389 |
-
emo_commit_loss = emo_commit_loss.to(emo_emb.device)
|
390 |
-
else:
|
391 |
-
emo_emb = (
|
392 |
-
self.emo_quantizer[sid[0]]
|
393 |
-
.get_output_from_indices(emo.to(torch.int).cpu())
|
394 |
-
.unsqueeze(0)
|
395 |
-
.to(emo.device)
|
396 |
-
)
|
397 |
-
emo_commit_loss = torch.zeros(1)
|
398 |
x = (
|
399 |
self.emb(x)
|
400 |
+ self.tone_emb(tone)
|
@@ -402,7 +432,7 @@ class TextEncoder(nn.Module):
|
|
402 |
+ bert_emb
|
403 |
+ ja_bert_emb
|
404 |
+ en_bert_emb
|
405 |
-
+
|
406 |
) * math.sqrt(
|
407 |
self.hidden_channels
|
408 |
) # [b, t, h]
|
@@ -415,7 +445,7 @@ class TextEncoder(nn.Module):
|
|
415 |
stats = self.proj(x) * x_mask
|
416 |
|
417 |
m, logs = torch.split(stats, self.out_channels, dim=1)
|
418 |
-
return x, m, logs, x_mask,
|
419 |
|
420 |
|
421 |
class ResidualCouplingBlock(nn.Module):
|
@@ -989,6 +1019,7 @@ class SynthesizerTrn(nn.Module):
|
|
989 |
y_mask,
|
990 |
(z, z_p, m_p, logs_p, m_q, logs_q),
|
991 |
(x, logw, logw_),
|
|
|
992 |
loss_commit,
|
993 |
)
|
994 |
|
|
|
10 |
|
11 |
from torch.nn import Conv1d, ConvTranspose1d, Conv2d
|
12 |
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
|
|
13 |
|
14 |
from commons import init_weights, get_padding
|
15 |
from text import symbols, num_tones, num_languages
|
16 |
|
17 |
+
from vector_quantize_pytorch import VectorQuantize
|
18 |
+
|
19 |
|
20 |
class DurationDiscriminator(nn.Module): # vits2
|
21 |
def __init__(
|
|
|
312 |
return x * x_mask
|
313 |
|
314 |
|
315 |
+
class Bottleneck(nn.Sequential):
|
316 |
+
def __init__(self, in_dim, hidden_dim):
|
317 |
+
c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
|
318 |
+
c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
|
319 |
+
super().__init__(*[c_fc1, c_fc2])
|
320 |
+
|
321 |
+
|
322 |
+
class Block(nn.Module):
|
323 |
+
def __init__(self, in_dim, hidden_dim) -> None:
|
324 |
+
super().__init__()
|
325 |
+
self.norm = nn.LayerNorm(in_dim)
|
326 |
+
self.mlp = MLP(in_dim, hidden_dim)
|
327 |
+
|
328 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
329 |
+
x = x + self.mlp(self.norm(x))
|
330 |
+
return x
|
331 |
+
|
332 |
+
|
333 |
+
class MLP(nn.Module):
|
334 |
+
def __init__(self, in_dim, hidden_dim):
|
335 |
+
super().__init__()
|
336 |
+
self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
|
337 |
+
self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
|
338 |
+
self.c_proj = nn.Linear(hidden_dim, in_dim, bias=False)
|
339 |
+
|
340 |
+
def forward(self, x: torch.Tensor):
|
341 |
+
x = F.silu(self.c_fc1(x)) * self.c_fc2(x)
|
342 |
+
x = self.c_proj(x)
|
343 |
+
return x
|
344 |
+
|
345 |
+
|
346 |
class TextEncoder(nn.Module):
|
347 |
def __init__(
|
348 |
self,
|
|
|
376 |
self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
|
377 |
self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
|
378 |
self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
|
379 |
+
# self.emo_proj = nn.Linear(512, hidden_channels)
|
380 |
+
self.in_feature_net = nn.Sequential(
|
381 |
+
# input is assumed to an already normalized embedding
|
382 |
+
nn.Linear(512, 1028, bias=False),
|
383 |
+
nn.GELU(),
|
384 |
+
nn.LayerNorm(1028),
|
385 |
+
*[Block(1028, 512) for _ in range(1)],
|
386 |
+
nn.Linear(1028, 512, bias=False),
|
387 |
+
# normalize before passing to VQ?
|
388 |
+
# nn.GELU(),
|
389 |
+
# nn.LayerNorm(512),
|
390 |
+
)
|
391 |
+
self.emo_vq = VectorQuantize(
|
392 |
+
dim=512,
|
393 |
+
codebook_size=64,
|
394 |
+
codebook_dim=32,
|
395 |
+
commitment_weight=0.1,
|
396 |
+
decay=0.85,
|
397 |
+
heads=32,
|
398 |
+
kmeans_iters=20,
|
399 |
+
separate_codebook_per_head=True,
|
400 |
+
stochastic_sample_codes=True,
|
401 |
+
threshold_ema_dead_code=2,
|
402 |
+
)
|
403 |
+
self.out_feature_net = nn.Linear(512, hidden_channels)
|
404 |
|
405 |
self.encoder = attentions.Encoder(
|
406 |
hidden_channels,
|
|
|
420 |
bert_emb = self.bert_proj(bert).transpose(1, 2)
|
421 |
ja_bert_emb = self.ja_bert_proj(ja_bert).transpose(1, 2)
|
422 |
en_bert_emb = self.en_bert_proj(en_bert).transpose(1, 2)
|
423 |
+
emo_emb = self.in_feature_net(emo)
|
424 |
+
emo_emb, _, loss_commit = self.emo_vq(emo_emb.unsqueeze(1))
|
425 |
+
loss_commit = loss_commit.mean()
|
426 |
+
emo_emb = self.out_feature_net(emo_emb)
|
427 |
+
# emo_emb = self.emo_proj(emo.unsqueeze(1))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
428 |
x = (
|
429 |
self.emb(x)
|
430 |
+ self.tone_emb(tone)
|
|
|
432 |
+ bert_emb
|
433 |
+ ja_bert_emb
|
434 |
+ en_bert_emb
|
435 |
+
+ emo_emb
|
436 |
) * math.sqrt(
|
437 |
self.hidden_channels
|
438 |
) # [b, t, h]
|
|
|
445 |
stats = self.proj(x) * x_mask
|
446 |
|
447 |
m, logs = torch.split(stats, self.out_channels, dim=1)
|
448 |
+
return x, m, logs, x_mask, loss_commit
|
449 |
|
450 |
|
451 |
class ResidualCouplingBlock(nn.Module):
|
|
|
1019 |
y_mask,
|
1020 |
(z, z_p, m_p, logs_p, m_q, logs_q),
|
1021 |
(x, logw, logw_),
|
1022 |
+
g,
|
1023 |
loss_commit,
|
1024 |
)
|
1025 |
|
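The models.py change above replaces the old per-speaker emotion quantizer with a single pipeline inside `TextEncoder`: `in_feature_net` maps the 512-d CLAP feature through a small MLP, a multi-head `VectorQuantize` layer (`emo_vq`) discretizes it, and `out_feature_net` projects it to `hidden_channels` so it can be summed with the phone/tone/language/BERT embeddings, with the commitment loss returned alongside. Below is a standalone sketch of just that path using the same VQ hyperparameters as the diff; the residual `Block` between the two linear layers is omitted for brevity, and `hidden_channels=192` is an assumption rather than a value read from this diff:

```python
# Standalone sketch of the new emotion path in TextEncoder (VQ hyperparameters
# copied from the diff; the residual Block is omitted, hidden_channels=192 assumed).
import torch
import torch.nn as nn
from vector_quantize_pytorch import VectorQuantize

hidden_channels = 192
in_feature_net = nn.Sequential(
    nn.Linear(512, 1028, bias=False),
    nn.GELU(),
    nn.LayerNorm(1028),
    nn.Linear(1028, 512, bias=False),
)
emo_vq = VectorQuantize(
    dim=512, codebook_size=64, codebook_dim=32, commitment_weight=0.1,
    decay=0.85, heads=32, kmeans_iters=20, separate_codebook_per_head=True,
    stochastic_sample_codes=True, threshold_ema_dead_code=2,
)
out_feature_net = nn.Linear(512, hidden_channels)

emo = torch.randn(2, 512)                          # batch of CLAP features
h = in_feature_net(emo)                            # (B, 512)
q, indices, loss_commit = emo_vq(h.unsqueeze(1))   # quantize as a length-1 sequence
emo_emb = out_feature_net(q)                       # (B, 1, hidden_channels), added to the token embeddings
print(emo_emb.shape, loss_commit.mean().item())
```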
monotonic_align/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (1.64 kB). View file
|
|
monotonic_align/__pycache__/core.cpython-311.pyc
ADDED
Binary file (2 kB). View file
|
|
onnx_modules/V200/__init__.py
ADDED
File without changes
|
onnx_modules/V200/attentions_onnx.py
ADDED
@@ -0,0 +1,378 @@
1 |
+
import math
|
2 |
+
import torch
|
3 |
+
from torch import nn
|
4 |
+
from torch.nn import functional as F
|
5 |
+
|
6 |
+
import commons
|
7 |
+
import logging
|
8 |
+
|
9 |
+
logger = logging.getLogger(__name__)
|
10 |
+
|
11 |
+
|
12 |
+
class LayerNorm(nn.Module):
|
13 |
+
def __init__(self, channels, eps=1e-5):
|
14 |
+
super().__init__()
|
15 |
+
self.channels = channels
|
16 |
+
self.eps = eps
|
17 |
+
|
18 |
+
self.gamma = nn.Parameter(torch.ones(channels))
|
19 |
+
self.beta = nn.Parameter(torch.zeros(channels))
|
20 |
+
|
21 |
+
def forward(self, x):
|
22 |
+
x = x.transpose(1, -1)
|
23 |
+
x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
|
24 |
+
return x.transpose(1, -1)
|
25 |
+
|
26 |
+
|
27 |
+
@torch.jit.script
|
28 |
+
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
|
29 |
+
n_channels_int = n_channels[0]
|
30 |
+
in_act = input_a + input_b
|
31 |
+
t_act = torch.tanh(in_act[:, :n_channels_int, :])
|
32 |
+
s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
|
33 |
+
acts = t_act * s_act
|
34 |
+
return acts
|
35 |
+
|
36 |
+
|
37 |
+
class Encoder(nn.Module):
|
38 |
+
def __init__(
|
39 |
+
self,
|
40 |
+
hidden_channels,
|
41 |
+
filter_channels,
|
42 |
+
n_heads,
|
43 |
+
n_layers,
|
44 |
+
kernel_size=1,
|
45 |
+
p_dropout=0.0,
|
46 |
+
window_size=4,
|
47 |
+
isflow=True,
|
48 |
+
**kwargs
|
49 |
+
):
|
50 |
+
super().__init__()
|
51 |
+
self.hidden_channels = hidden_channels
|
52 |
+
self.filter_channels = filter_channels
|
53 |
+
self.n_heads = n_heads
|
54 |
+
self.n_layers = n_layers
|
55 |
+
self.kernel_size = kernel_size
|
56 |
+
self.p_dropout = p_dropout
|
57 |
+
self.window_size = window_size
|
58 |
+
# if isflow:
|
59 |
+
# cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
|
60 |
+
# self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
|
61 |
+
# self.cond_layer = weight_norm(cond_layer, name='weight')
|
62 |
+
# self.gin_channels = 256
|
63 |
+
self.cond_layer_idx = self.n_layers
|
64 |
+
if "gin_channels" in kwargs:
|
65 |
+
self.gin_channels = kwargs["gin_channels"]
|
66 |
+
if self.gin_channels != 0:
|
67 |
+
self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
|
68 |
+
# vits2 says 3rd block, so idx is 2 by default
|
69 |
+
self.cond_layer_idx = (
|
70 |
+
kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
|
71 |
+
)
|
72 |
+
logging.debug(self.gin_channels, self.cond_layer_idx)
|
73 |
+
assert (
|
74 |
+
self.cond_layer_idx < self.n_layers
|
75 |
+
), "cond_layer_idx should be less than n_layers"
|
76 |
+
self.drop = nn.Dropout(p_dropout)
|
77 |
+
self.attn_layers = nn.ModuleList()
|
78 |
+
self.norm_layers_1 = nn.ModuleList()
|
79 |
+
self.ffn_layers = nn.ModuleList()
|
80 |
+
self.norm_layers_2 = nn.ModuleList()
|
81 |
+
for i in range(self.n_layers):
|
82 |
+
self.attn_layers.append(
|
83 |
+
MultiHeadAttention(
|
84 |
+
hidden_channels,
|
85 |
+
hidden_channels,
|
86 |
+
n_heads,
|
87 |
+
p_dropout=p_dropout,
|
88 |
+
window_size=window_size,
|
89 |
+
)
|
90 |
+
)
|
91 |
+
self.norm_layers_1.append(LayerNorm(hidden_channels))
|
92 |
+
self.ffn_layers.append(
|
93 |
+
FFN(
|
94 |
+
hidden_channels,
|
95 |
+
hidden_channels,
|
96 |
+
filter_channels,
|
97 |
+
kernel_size,
|
98 |
+
p_dropout=p_dropout,
|
99 |
+
)
|
100 |
+
)
|
101 |
+
self.norm_layers_2.append(LayerNorm(hidden_channels))
|
102 |
+
|
103 |
+
def forward(self, x, x_mask, g=None):
|
104 |
+
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
|
105 |
+
x = x * x_mask
|
106 |
+
for i in range(self.n_layers):
|
107 |
+
if i == self.cond_layer_idx and g is not None:
|
108 |
+
g = self.spk_emb_linear(g.transpose(1, 2))
|
109 |
+
g = g.transpose(1, 2)
|
110 |
+
x = x + g
|
111 |
+
x = x * x_mask
|
112 |
+
y = self.attn_layers[i](x, x, attn_mask)
|
113 |
+
y = self.drop(y)
|
114 |
+
x = self.norm_layers_1[i](x + y)
|
115 |
+
|
116 |
+
y = self.ffn_layers[i](x, x_mask)
|
117 |
+
y = self.drop(y)
|
118 |
+
x = self.norm_layers_2[i](x + y)
|
119 |
+
x = x * x_mask
|
120 |
+
return x
|
121 |
+
|
122 |
+
|
123 |
+
class MultiHeadAttention(nn.Module):
|
124 |
+
def __init__(
|
125 |
+
self,
|
126 |
+
channels,
|
127 |
+
out_channels,
|
128 |
+
n_heads,
|
129 |
+
p_dropout=0.0,
|
130 |
+
window_size=None,
|
131 |
+
heads_share=True,
|
132 |
+
block_length=None,
|
133 |
+
proximal_bias=False,
|
134 |
+
proximal_init=False,
|
135 |
+
):
|
136 |
+
super().__init__()
|
137 |
+
assert channels % n_heads == 0
|
138 |
+
|
139 |
+
self.channels = channels
|
140 |
+
self.out_channels = out_channels
|
141 |
+
self.n_heads = n_heads
|
142 |
+
self.p_dropout = p_dropout
|
143 |
+
self.window_size = window_size
|
144 |
+
self.heads_share = heads_share
|
145 |
+
self.block_length = block_length
|
146 |
+
self.proximal_bias = proximal_bias
|
147 |
+
self.proximal_init = proximal_init
|
148 |
+
self.attn = None
|
149 |
+
|
150 |
+
self.k_channels = channels // n_heads
|
151 |
+
self.conv_q = nn.Conv1d(channels, channels, 1)
|
152 |
+
self.conv_k = nn.Conv1d(channels, channels, 1)
|
153 |
+
self.conv_v = nn.Conv1d(channels, channels, 1)
|
154 |
+
self.conv_o = nn.Conv1d(channels, out_channels, 1)
|
155 |
+
self.drop = nn.Dropout(p_dropout)
|
156 |
+
|
157 |
+
if window_size is not None:
|
158 |
+
n_heads_rel = 1 if heads_share else n_heads
|
159 |
+
rel_stddev = self.k_channels**-0.5
|
160 |
+
self.emb_rel_k = nn.Parameter(
|
161 |
+
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
|
162 |
+
* rel_stddev
|
163 |
+
)
|
164 |
+
self.emb_rel_v = nn.Parameter(
|
165 |
+
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
|
166 |
+
* rel_stddev
|
167 |
+
)
|
168 |
+
|
169 |
+
nn.init.xavier_uniform_(self.conv_q.weight)
|
170 |
+
nn.init.xavier_uniform_(self.conv_k.weight)
|
171 |
+
nn.init.xavier_uniform_(self.conv_v.weight)
|
172 |
+
if proximal_init:
|
173 |
+
with torch.no_grad():
|
174 |
+
self.conv_k.weight.copy_(self.conv_q.weight)
|
175 |
+
self.conv_k.bias.copy_(self.conv_q.bias)
|
176 |
+
|
177 |
+
def forward(self, x, c, attn_mask=None):
|
178 |
+
q = self.conv_q(x)
|
179 |
+
k = self.conv_k(c)
|
180 |
+
v = self.conv_v(c)
|
181 |
+
|
182 |
+
x, self.attn = self.attention(q, k, v, mask=attn_mask)
|
183 |
+
|
184 |
+
x = self.conv_o(x)
|
185 |
+
return x
|
186 |
+
|
187 |
+
def attention(self, query, key, value, mask=None):
|
188 |
+
# reshape [b, d, t] -> [b, n_h, t, d_k]
|
189 |
+
b, d, t_s, t_t = (*key.size(), query.size(2))
|
190 |
+
query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
|
191 |
+
key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
|
192 |
+
value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
|
193 |
+
|
194 |
+
scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
|
195 |
+
if self.window_size is not None:
|
196 |
+
assert (
|
197 |
+
t_s == t_t
|
198 |
+
), "Relative attention is only available for self-attention."
|
199 |
+
key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
|
200 |
+
rel_logits = self._matmul_with_relative_keys(
|
201 |
+
query / math.sqrt(self.k_channels), key_relative_embeddings
|
202 |
+
)
|
203 |
+
scores_local = self._relative_position_to_absolute_position(rel_logits)
|
204 |
+
scores = scores + scores_local
|
205 |
+
if self.proximal_bias:
|
206 |
+
assert t_s == t_t, "Proximal bias is only available for self-attention."
|
207 |
+
scores = scores + self._attention_bias_proximal(t_s).to(
|
208 |
+
device=scores.device, dtype=scores.dtype
|
209 |
+
)
|
210 |
+
if mask is not None:
|
211 |
+
scores = scores.masked_fill(mask == 0, -1e4)
|
212 |
+
if self.block_length is not None:
|
213 |
+
assert (
|
214 |
+
t_s == t_t
|
215 |
+
), "Local attention is only available for self-attention."
|
216 |
+
block_mask = (
|
217 |
+
torch.ones_like(scores)
|
218 |
+
.triu(-self.block_length)
|
219 |
+
.tril(self.block_length)
|
220 |
+
)
|
221 |
+
scores = scores.masked_fill(block_mask == 0, -1e4)
|
222 |
+
p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
|
223 |
+
p_attn = self.drop(p_attn)
|
224 |
+
output = torch.matmul(p_attn, value)
|
225 |
+
if self.window_size is not None:
|
226 |
+
relative_weights = self._absolute_position_to_relative_position(p_attn)
|
227 |
+
value_relative_embeddings = self._get_relative_embeddings(
|
228 |
+
self.emb_rel_v, t_s
|
229 |
+
)
|
230 |
+
output = output + self._matmul_with_relative_values(
|
231 |
+
relative_weights, value_relative_embeddings
|
232 |
+
)
|
233 |
+
output = (
|
234 |
+
output.transpose(2, 3).contiguous().view(b, d, t_t)
|
235 |
+
) # [b, n_h, t_t, d_k] -> [b, d, t_t]
|
236 |
+
return output, p_attn
|
237 |
+
|
238 |
+
def _matmul_with_relative_values(self, x, y):
|
239 |
+
"""
|
240 |
+
x: [b, h, l, m]
|
241 |
+
y: [h or 1, m, d]
|
242 |
+
ret: [b, h, l, d]
|
243 |
+
"""
|
244 |
+
ret = torch.matmul(x, y.unsqueeze(0))
|
245 |
+
return ret
|
246 |
+
|
247 |
+
def _matmul_with_relative_keys(self, x, y):
|
248 |
+
"""
|
249 |
+
x: [b, h, l, d]
|
250 |
+
y: [h or 1, m, d]
|
251 |
+
ret: [b, h, l, m]
|
252 |
+
"""
|
253 |
+
ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
|
254 |
+
return ret
|
255 |
+
|
256 |
+
def _get_relative_embeddings(self, relative_embeddings, length):
|
257 |
+
max_relative_position = 2 * self.window_size + 1
|
258 |
+
# Pad first before slice to avoid using cond ops.
|
259 |
+
pad_length = max(length - (self.window_size + 1), 0)
|
260 |
+
slice_start_position = max((self.window_size + 1) - length, 0)
|
261 |
+
slice_end_position = slice_start_position + 2 * length - 1
|
262 |
+
if pad_length > 0:
|
263 |
+
padded_relative_embeddings = F.pad(
|
264 |
+
relative_embeddings,
|
265 |
+
commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
|
266 |
+
)
|
267 |
+
else:
|
268 |
+
padded_relative_embeddings = relative_embeddings
|
269 |
+
used_relative_embeddings = padded_relative_embeddings[
|
270 |
+
:, slice_start_position:slice_end_position
|
271 |
+
]
|
272 |
+
return used_relative_embeddings
|
273 |
+
|
274 |
+
def _relative_position_to_absolute_position(self, x):
|
275 |
+
"""
|
276 |
+
x: [b, h, l, 2*l-1]
|
277 |
+
ret: [b, h, l, l]
|
278 |
+
"""
|
279 |
+
batch, heads, length, _ = x.size()
|
280 |
+
# Concat columns of pad to shift from relative to absolute indexing.
|
281 |
+
x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
|
282 |
+
|
283 |
+
# Concat extra elements so to add up to shape (len+1, 2*len-1).
|
284 |
+
x_flat = x.view([batch, heads, length * 2 * length])
|
285 |
+
x_flat = F.pad(
|
286 |
+
x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
|
287 |
+
)
|
288 |
+
|
289 |
+
# Reshape and slice out the padded elements.
|
290 |
+
x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
|
291 |
+
:, :, :length, length - 1 :
|
292 |
+
]
|
293 |
+
return x_final
|
294 |
+
|
295 |
+
def _absolute_position_to_relative_position(self, x):
|
296 |
+
"""
|
297 |
+
x: [b, h, l, l]
|
298 |
+
ret: [b, h, l, 2*l-1]
|
299 |
+
"""
|
300 |
+
batch, heads, length, _ = x.size()
|
301 |
+
# padd along column
|
302 |
+
x = F.pad(
|
303 |
+
x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
|
304 |
+
)
|
305 |
+
x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
|
306 |
+
# add 0's in the beginning that will skew the elements after reshape
|
307 |
+
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
|
308 |
+
x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
|
309 |
+
return x_final
|
310 |
+
|
311 |
+
def _attention_bias_proximal(self, length):
|
312 |
+
"""Bias for self-attention to encourage attention to close positions.
|
313 |
+
Args:
|
314 |
+
length: an integer scalar.
|
315 |
+
Returns:
|
316 |
+
a Tensor with shape [1, 1, length, length]
|
317 |
+
"""
|
318 |
+
r = torch.arange(length, dtype=torch.float32)
|
319 |
+
diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
|
320 |
+
return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
|
321 |
+
|
322 |
+
|
323 |
+
class FFN(nn.Module):
|
324 |
+
def __init__(
|
325 |
+
self,
|
326 |
+
in_channels,
|
327 |
+
out_channels,
|
328 |
+
filter_channels,
|
329 |
+
kernel_size,
|
330 |
+
p_dropout=0.0,
|
331 |
+
activation=None,
|
332 |
+
causal=False,
|
333 |
+
):
|
334 |
+
super().__init__()
|
335 |
+
self.in_channels = in_channels
|
336 |
+
self.out_channels = out_channels
|
337 |
+
self.filter_channels = filter_channels
|
338 |
+
self.kernel_size = kernel_size
|
339 |
+
self.p_dropout = p_dropout
|
340 |
+
self.activation = activation
|
341 |
+
self.causal = causal
|
342 |
+
|
343 |
+
if causal:
|
344 |
+
self.padding = self._causal_padding
|
345 |
+
else:
|
346 |
+
self.padding = self._same_padding
|
347 |
+
|
348 |
+
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
|
349 |
+
self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
|
350 |
+
self.drop = nn.Dropout(p_dropout)
|
351 |
+
|
352 |
+
def forward(self, x, x_mask):
|
353 |
+
x = self.conv_1(self.padding(x * x_mask))
|
354 |
+
if self.activation == "gelu":
|
355 |
+
x = x * torch.sigmoid(1.702 * x)
|
356 |
+
else:
|
357 |
+
x = torch.relu(x)
|
358 |
+
x = self.drop(x)
|
359 |
+
x = self.conv_2(self.padding(x * x_mask))
|
360 |
+
return x * x_mask
|
361 |
+
|
362 |
+
def _causal_padding(self, x):
|
363 |
+
if self.kernel_size == 1:
|
364 |
+
return x
|
365 |
+
pad_l = self.kernel_size - 1
|
366 |
+
pad_r = 0
|
367 |
+
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
368 |
+
x = F.pad(x, commons.convert_pad_shape(padding))
|
369 |
+
return x
|
370 |
+
|
371 |
+
def _same_padding(self, x):
|
372 |
+
if self.kernel_size == 1:
|
373 |
+
return x
|
374 |
+
pad_l = (self.kernel_size - 1) // 2
|
375 |
+
pad_r = self.kernel_size // 2
|
376 |
+
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
377 |
+
x = F.pad(x, commons.convert_pad_shape(padding))
|
378 |
+
return x
|
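attentions_onnx.py is a trimmed, ONNX-friendly copy of the attention stack: relative-position `MultiHeadAttention`, the `FFN`, and an `Encoder` that injects the speaker embedding `g` at `cond_layer_idx` (the third block by default). A shape-check sketch follows; the channel sizes are assumptions based on typical Bert-VITS2 configs, not values read from this file:

```python
# Shape-check sketch for the ONNX attention Encoder. Channel sizes are
# assumptions (typical Bert-VITS2 values), not read from this file.
import torch
from onnx_modules.V200.attentions_onnx import Encoder

enc = Encoder(
    hidden_channels=192, filter_channels=768, n_heads=2, n_layers=6,
    kernel_size=3, p_dropout=0.1, gin_channels=256,
)
x = torch.randn(1, 192, 50)    # (batch, hidden, time)
x_mask = torch.ones(1, 1, 50)  # all positions valid
g = torch.randn(1, 256, 1)     # speaker embedding, injected at cond_layer_idx
with torch.no_grad():
    y = enc(x, x_mask, g=g)
print(y.shape)                 # torch.Size([1, 192, 50])
```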
onnx_modules/V200/models_onnx.py
ADDED
@@ -0,0 +1,990 @@
import math
import torch
from torch import nn
from torch.nn import functional as F

import commons
import modules
from . import attentions_onnx

from torch.nn import Conv1d, ConvTranspose1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from commons import init_weights, get_padding
from .text import symbols, num_tones, num_languages


class DurationDiscriminator(nn.Module):  # vits2
class DurationDiscriminator(nn.Module): # vits2
|
17 |
+
def __init__(
|
18 |
+
self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
|
19 |
+
):
|
20 |
+
super().__init__()
|
21 |
+
|
22 |
+
self.in_channels = in_channels
|
23 |
+
self.filter_channels = filter_channels
|
24 |
+
self.kernel_size = kernel_size
|
25 |
+
self.p_dropout = p_dropout
|
26 |
+
self.gin_channels = gin_channels
|
27 |
+
|
28 |
+
self.drop = nn.Dropout(p_dropout)
|
29 |
+
self.conv_1 = nn.Conv1d(
|
30 |
+
in_channels, filter_channels, kernel_size, padding=kernel_size // 2
|
31 |
+
)
|
32 |
+
self.norm_1 = modules.LayerNorm(filter_channels)
|
33 |
+
self.conv_2 = nn.Conv1d(
|
34 |
+
filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
|
35 |
+
)
|
36 |
+
self.norm_2 = modules.LayerNorm(filter_channels)
|
37 |
+
self.dur_proj = nn.Conv1d(1, filter_channels, 1)
|
38 |
+
|
39 |
+
self.pre_out_conv_1 = nn.Conv1d(
|
40 |
+
2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
|
41 |
+
)
|
42 |
+
self.pre_out_norm_1 = modules.LayerNorm(filter_channels)
|
43 |
+
self.pre_out_conv_2 = nn.Conv1d(
|
44 |
+
filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
|
45 |
+
)
|
46 |
+
self.pre_out_norm_2 = modules.LayerNorm(filter_channels)
|
47 |
+
|
48 |
+
if gin_channels != 0:
|
49 |
+
self.cond = nn.Conv1d(gin_channels, in_channels, 1)
|
50 |
+
|
51 |
+
self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid())
|
52 |
+
|
53 |
+
def forward_probability(self, x, x_mask, dur, g=None):
|
54 |
+
dur = self.dur_proj(dur)
|
55 |
+
x = torch.cat([x, dur], dim=1)
|
56 |
+
x = self.pre_out_conv_1(x * x_mask)
|
57 |
+
x = torch.relu(x)
|
58 |
+
x = self.pre_out_norm_1(x)
|
59 |
+
x = self.drop(x)
|
60 |
+
x = self.pre_out_conv_2(x * x_mask)
|
61 |
+
x = torch.relu(x)
|
62 |
+
x = self.pre_out_norm_2(x)
|
63 |
+
x = self.drop(x)
|
64 |
+
x = x * x_mask
|
65 |
+
x = x.transpose(1, 2)
|
66 |
+
output_prob = self.output_layer(x)
|
67 |
+
return output_prob
|
68 |
+
|
69 |
+
def forward(self, x, x_mask, dur_r, dur_hat, g=None):
|
70 |
+
x = torch.detach(x)
|
71 |
+
if g is not None:
|
72 |
+
g = torch.detach(g)
|
73 |
+
x = x + self.cond(g)
|
74 |
+
x = self.conv_1(x * x_mask)
|
75 |
+
x = torch.relu(x)
|
76 |
+
x = self.norm_1(x)
|
77 |
+
x = self.drop(x)
|
78 |
+
x = self.conv_2(x * x_mask)
|
79 |
+
x = torch.relu(x)
|
80 |
+
x = self.norm_2(x)
|
81 |
+
x = self.drop(x)
|
82 |
+
|
83 |
+
output_probs = []
|
84 |
+
for dur in [dur_r, dur_hat]:
|
85 |
+
output_prob = self.forward_probability(x, x_mask, dur, g)
|
86 |
+
output_probs.append(output_prob)
|
87 |
+
|
88 |
+
return output_probs
|
89 |
+
|
90 |
+
|
91 |
+
class TransformerCouplingBlock(nn.Module):
|
92 |
+
def __init__(
|
93 |
+
self,
|
94 |
+
channels,
|
95 |
+
hidden_channels,
|
96 |
+
filter_channels,
|
97 |
+
n_heads,
|
98 |
+
n_layers,
|
99 |
+
kernel_size,
|
100 |
+
p_dropout,
|
101 |
+
n_flows=4,
|
102 |
+
gin_channels=0,
|
103 |
+
share_parameter=False,
|
104 |
+
):
|
105 |
+
super().__init__()
|
106 |
+
self.channels = channels
|
107 |
+
self.hidden_channels = hidden_channels
|
108 |
+
self.kernel_size = kernel_size
|
109 |
+
self.n_layers = n_layers
|
110 |
+
self.n_flows = n_flows
|
111 |
+
self.gin_channels = gin_channels
|
112 |
+
|
113 |
+
self.flows = nn.ModuleList()
|
114 |
+
|
115 |
+
self.wn = (
|
116 |
+
attentions_onnx.FFT(
|
117 |
+
hidden_channels,
|
118 |
+
filter_channels,
|
119 |
+
n_heads,
|
120 |
+
n_layers,
|
121 |
+
kernel_size,
|
122 |
+
p_dropout,
|
123 |
+
isflow=True,
|
124 |
+
gin_channels=self.gin_channels,
|
125 |
+
)
|
126 |
+
if share_parameter
|
127 |
+
else None
|
128 |
+
)
|
129 |
+
|
130 |
+
for i in range(n_flows):
|
131 |
+
self.flows.append(
|
132 |
+
modules.TransformerCouplingLayer(
|
133 |
+
channels,
|
134 |
+
hidden_channels,
|
135 |
+
kernel_size,
|
136 |
+
n_layers,
|
137 |
+
n_heads,
|
138 |
+
p_dropout,
|
139 |
+
filter_channels,
|
140 |
+
mean_only=True,
|
141 |
+
wn_sharing_parameter=self.wn,
|
142 |
+
gin_channels=self.gin_channels,
|
143 |
+
)
|
144 |
+
)
|
145 |
+
self.flows.append(modules.Flip())
|
146 |
+
|
147 |
+
def forward(self, x, x_mask, g=None, reverse=True):
|
148 |
+
if not reverse:
|
149 |
+
for flow in self.flows:
|
150 |
+
x, _ = flow(x, x_mask, g=g, reverse=reverse)
|
151 |
+
else:
|
152 |
+
for flow in reversed(self.flows):
|
153 |
+
x = flow(x, x_mask, g=g, reverse=reverse)
|
154 |
+
return x
|
155 |
+
|
156 |
+
|
157 |
+
class StochasticDurationPredictor(nn.Module):
|
158 |
+
def __init__(
|
159 |
+
self,
|
160 |
+
in_channels,
|
161 |
+
filter_channels,
|
162 |
+
kernel_size,
|
163 |
+
p_dropout,
|
164 |
+
n_flows=4,
|
165 |
+
gin_channels=0,
|
166 |
+
):
|
167 |
+
super().__init__()
|
168 |
+
filter_channels = in_channels # it needs to be removed from future version.
|
169 |
+
self.in_channels = in_channels
|
170 |
+
self.filter_channels = filter_channels
|
171 |
+
self.kernel_size = kernel_size
|
172 |
+
self.p_dropout = p_dropout
|
173 |
+
self.n_flows = n_flows
|
174 |
+
self.gin_channels = gin_channels
|
175 |
+
|
176 |
+
self.log_flow = modules.Log()
|
177 |
+
self.flows = nn.ModuleList()
|
178 |
+
self.flows.append(modules.ElementwiseAffine(2))
|
179 |
+
for i in range(n_flows):
|
180 |
+
self.flows.append(
|
181 |
+
modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
|
182 |
+
)
|
183 |
+
self.flows.append(modules.Flip())
|
184 |
+
|
185 |
+
self.post_pre = nn.Conv1d(1, filter_channels, 1)
|
186 |
+
self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
|
187 |
+
self.post_convs = modules.DDSConv(
|
188 |
+
filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
|
189 |
+
)
|
190 |
+
self.post_flows = nn.ModuleList()
|
191 |
+
self.post_flows.append(modules.ElementwiseAffine(2))
|
192 |
+
for i in range(4):
|
193 |
+
self.post_flows.append(
|
194 |
+
modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
|
195 |
+
)
|
196 |
+
self.post_flows.append(modules.Flip())
|
197 |
+
|
198 |
+
self.pre = nn.Conv1d(in_channels, filter_channels, 1)
|
199 |
+
self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
|
200 |
+
self.convs = modules.DDSConv(
|
201 |
+
filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
|
202 |
+
)
|
203 |
+
if gin_channels != 0:
|
204 |
+
self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
|
205 |
+
|
206 |
+
def forward(self, x, x_mask, z, g=None):
|
207 |
+
x = torch.detach(x)
|
208 |
+
x = self.pre(x)
|
209 |
+
if g is not None:
|
210 |
+
g = torch.detach(g)
|
211 |
+
x = x + self.cond(g)
|
212 |
+
x = self.convs(x, x_mask)
|
213 |
+
x = self.proj(x) * x_mask
|
214 |
+
|
215 |
+
flows = list(reversed(self.flows))
|
216 |
+
flows = flows[:-2] + [flows[-1]] # remove a useless vflow
|
217 |
+
for flow in flows:
|
218 |
+
z = flow(z, x_mask, g=x, reverse=True)
|
219 |
+
z0, z1 = torch.split(z, [1, 1], 1)
|
220 |
+
logw = z0
|
221 |
+
return logw
|
222 |
+
|
223 |
+
|
224 |
+
class DurationPredictor(nn.Module):
|
225 |
+
def __init__(
|
226 |
+
self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
|
227 |
+
):
|
228 |
+
super().__init__()
|
229 |
+
|
230 |
+
self.in_channels = in_channels
|
231 |
+
self.filter_channels = filter_channels
|
232 |
+
self.kernel_size = kernel_size
|
233 |
+
self.p_dropout = p_dropout
|
234 |
+
self.gin_channels = gin_channels
|
235 |
+
|
236 |
+
self.drop = nn.Dropout(p_dropout)
|
237 |
+
self.conv_1 = nn.Conv1d(
|
238 |
+
in_channels, filter_channels, kernel_size, padding=kernel_size // 2
|
239 |
+
)
|
240 |
+
self.norm_1 = modules.LayerNorm(filter_channels)
|
241 |
+
self.conv_2 = nn.Conv1d(
|
242 |
+
filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
|
243 |
+
)
|
244 |
+
self.norm_2 = modules.LayerNorm(filter_channels)
|
245 |
+
self.proj = nn.Conv1d(filter_channels, 1, 1)
|
246 |
+
|
247 |
+
if gin_channels != 0:
|
248 |
+
self.cond = nn.Conv1d(gin_channels, in_channels, 1)
|
249 |
+
|
250 |
+
def forward(self, x, x_mask, g=None):
|
251 |
+
x = torch.detach(x)
|
252 |
+
if g is not None:
|
253 |
+
g = torch.detach(g)
|
254 |
+
x = x + self.cond(g)
|
255 |
+
x = self.conv_1(x * x_mask)
|
256 |
+
x = torch.relu(x)
|
257 |
+
x = self.norm_1(x)
|
258 |
+
x = self.drop(x)
|
259 |
+
x = self.conv_2(x * x_mask)
|
260 |
+
x = torch.relu(x)
|
261 |
+
x = self.norm_2(x)
|
262 |
+
x = self.drop(x)
|
263 |
+
x = self.proj(x * x_mask)
|
264 |
+
return x * x_mask
|
265 |
+
|
266 |
+
|
267 |
+
class TextEncoder(nn.Module):
|
268 |
+
def __init__(
|
269 |
+
self,
|
270 |
+
n_vocab,
|
271 |
+
out_channels,
|
272 |
+
hidden_channels,
|
273 |
+
filter_channels,
|
274 |
+
n_heads,
|
275 |
+
n_layers,
|
276 |
+
kernel_size,
|
277 |
+
p_dropout,
|
278 |
+
gin_channels=0,
|
279 |
+
):
|
280 |
+
super().__init__()
|
281 |
+
self.n_vocab = n_vocab
|
282 |
+
self.out_channels = out_channels
|
283 |
+
self.hidden_channels = hidden_channels
|
284 |
+
self.filter_channels = filter_channels
|
285 |
+
self.n_heads = n_heads
|
286 |
+
self.n_layers = n_layers
|
287 |
+
self.kernel_size = kernel_size
|
288 |
+
self.p_dropout = p_dropout
|
289 |
+
self.gin_channels = gin_channels
|
290 |
+
self.emb = nn.Embedding(len(symbols), hidden_channels)
|
291 |
+
nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
|
292 |
+
self.tone_emb = nn.Embedding(num_tones, hidden_channels)
|
293 |
+
nn.init.normal_(self.tone_emb.weight, 0.0, hidden_channels**-0.5)
|
294 |
+
self.language_emb = nn.Embedding(num_languages, hidden_channels)
|
295 |
+
nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels**-0.5)
|
296 |
+
self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
|
297 |
+
self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
|
298 |
+
self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
|
299 |
+
|
300 |
+
self.encoder = attentions_onnx.Encoder(
|
301 |
+
hidden_channels,
|
302 |
+
filter_channels,
|
303 |
+
n_heads,
|
304 |
+
n_layers,
|
305 |
+
kernel_size,
|
306 |
+
p_dropout,
|
307 |
+
gin_channels=self.gin_channels,
|
308 |
+
)
|
309 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
|
310 |
+
|
311 |
+
def forward(self, x, x_lengths, tone, language, bert, ja_bert, en_bert, g=None):
|
312 |
+
x_mask = torch.ones_like(x).unsqueeze(0)
|
313 |
+
bert_emb = self.bert_proj(bert.transpose(0, 1).unsqueeze(0)).transpose(1, 2)
|
314 |
+
ja_bert_emb = self.ja_bert_proj(ja_bert.transpose(0, 1).unsqueeze(0)).transpose(
|
315 |
+
1, 2
|
316 |
+
)
|
317 |
+
en_bert_emb = self.en_bert_proj(en_bert.transpose(0, 1).unsqueeze(0)).transpose(
|
318 |
+
1, 2
|
319 |
+
)
|
320 |
+
x = (
|
321 |
+
self.emb(x)
|
322 |
+
+ self.tone_emb(tone)
|
323 |
+
+ self.language_emb(language)
|
324 |
+
+ bert_emb
|
325 |
+
+ ja_bert_emb
|
326 |
+
+ en_bert_emb
|
327 |
+
) * math.sqrt(
|
328 |
+
self.hidden_channels
|
329 |
+
) # [b, t, h]
|
330 |
+
x = torch.transpose(x, 1, -1) # [b, h, t]
|
331 |
+
x_mask = x_mask.to(x.dtype)
|
332 |
+
|
333 |
+
x = self.encoder(x * x_mask, x_mask, g=g)
|
334 |
+
stats = self.proj(x) * x_mask
|
335 |
+
|
336 |
+
m, logs = torch.split(stats, self.out_channels, dim=1)
|
337 |
+
return x, m, logs, x_mask
|
338 |
+
|
339 |
+
|
340 |
+
class ResidualCouplingBlock(nn.Module):
|
341 |
+
def __init__(
|
342 |
+
self,
|
343 |
+
channels,
|
344 |
+
hidden_channels,
|
345 |
+
kernel_size,
|
346 |
+
dilation_rate,
|
347 |
+
n_layers,
|
348 |
+
n_flows=4,
|
349 |
+
gin_channels=0,
|
350 |
+
):
|
351 |
+
super().__init__()
|
352 |
+
self.channels = channels
|
353 |
+
self.hidden_channels = hidden_channels
|
354 |
+
self.kernel_size = kernel_size
|
355 |
+
self.dilation_rate = dilation_rate
|
356 |
+
self.n_layers = n_layers
|
357 |
+
self.n_flows = n_flows
|
358 |
+
self.gin_channels = gin_channels
|
359 |
+
|
360 |
+
self.flows = nn.ModuleList()
|
361 |
+
for i in range(n_flows):
|
362 |
+
self.flows.append(
|
363 |
+
modules.ResidualCouplingLayer(
|
364 |
+
channels,
|
365 |
+
hidden_channels,
|
366 |
+
kernel_size,
|
367 |
+
dilation_rate,
|
368 |
+
n_layers,
|
369 |
+
gin_channels=gin_channels,
|
370 |
+
mean_only=True,
|
371 |
+
)
|
372 |
+
)
|
373 |
+
self.flows.append(modules.Flip())
|
374 |
+
|
375 |
+
def forward(self, x, x_mask, g=None, reverse=True):
|
376 |
+
if not reverse:
|
377 |
+
for flow in self.flows:
|
378 |
+
x, _ = flow(x, x_mask, g=g, reverse=reverse)
|
379 |
+
else:
|
380 |
+
for flow in reversed(self.flows):
|
381 |
+
x = flow(x, x_mask, g=g, reverse=reverse)
|
382 |
+
return x
|
383 |
+
|
384 |
+
|
385 |
+
class PosteriorEncoder(nn.Module):
|
386 |
+
def __init__(
|
387 |
+
self,
|
388 |
+
in_channels,
|
389 |
+
out_channels,
|
390 |
+
hidden_channels,
|
391 |
+
kernel_size,
|
392 |
+
dilation_rate,
|
393 |
+
n_layers,
|
394 |
+
gin_channels=0,
|
395 |
+
):
|
396 |
+
super().__init__()
|
397 |
+
self.in_channels = in_channels
|
398 |
+
self.out_channels = out_channels
|
399 |
+
self.hidden_channels = hidden_channels
|
400 |
+
self.kernel_size = kernel_size
|
401 |
+
self.dilation_rate = dilation_rate
|
402 |
+
self.n_layers = n_layers
|
403 |
+
self.gin_channels = gin_channels
|
404 |
+
|
405 |
+
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
|
406 |
+
self.enc = modules.WN(
|
407 |
+
hidden_channels,
|
408 |
+
kernel_size,
|
409 |
+
dilation_rate,
|
410 |
+
n_layers,
|
411 |
+
gin_channels=gin_channels,
|
412 |
+
)
|
413 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
|
414 |
+
|
415 |
+
def forward(self, x, x_lengths, g=None):
|
416 |
+
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
|
417 |
+
x.dtype
|
418 |
+
)
|
419 |
+
x = self.pre(x) * x_mask
|
420 |
+
x = self.enc(x, x_mask, g=g)
|
421 |
+
stats = self.proj(x) * x_mask
|
422 |
+
m, logs = torch.split(stats, self.out_channels, dim=1)
|
423 |
+
z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
|
424 |
+
return z, m, logs, x_mask
|
425 |
+
|
426 |
+
|
427 |
+
class Generator(torch.nn.Module):
|
428 |
+
def __init__(
|
429 |
+
self,
|
430 |
+
initial_channel,
|
431 |
+
resblock,
|
432 |
+
resblock_kernel_sizes,
|
433 |
+
resblock_dilation_sizes,
|
434 |
+
upsample_rates,
|
435 |
+
upsample_initial_channel,
|
436 |
+
upsample_kernel_sizes,
|
437 |
+
gin_channels=0,
|
438 |
+
):
|
439 |
+
super(Generator, self).__init__()
|
440 |
+
self.num_kernels = len(resblock_kernel_sizes)
|
441 |
+
self.num_upsamples = len(upsample_rates)
|
442 |
+
self.conv_pre = Conv1d(
|
443 |
+
initial_channel, upsample_initial_channel, 7, 1, padding=3
|
444 |
+
)
|
445 |
+
resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
|
446 |
+
|
447 |
+
self.ups = nn.ModuleList()
|
448 |
+
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
|
449 |
+
self.ups.append(
|
450 |
+
weight_norm(
|
451 |
+
ConvTranspose1d(
|
452 |
+
upsample_initial_channel // (2**i),
|
453 |
+
upsample_initial_channel // (2 ** (i + 1)),
|
454 |
+
k,
|
455 |
+
u,
|
456 |
+
padding=(k - u) // 2,
|
457 |
+
)
|
458 |
+
)
|
459 |
+
)
|
460 |
+
|
461 |
+
self.resblocks = nn.ModuleList()
|
462 |
+
for i in range(len(self.ups)):
|
463 |
+
ch = upsample_initial_channel // (2 ** (i + 1))
|
464 |
+
for j, (k, d) in enumerate(
|
465 |
+
zip(resblock_kernel_sizes, resblock_dilation_sizes)
|
466 |
+
):
|
467 |
+
self.resblocks.append(resblock(ch, k, d))
|
468 |
+
|
469 |
+
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
|
470 |
+
self.ups.apply(init_weights)
|
471 |
+
|
472 |
+
if gin_channels != 0:
|
473 |
+
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
|
474 |
+
|
475 |
+
def forward(self, x, g=None):
|
476 |
+
x = self.conv_pre(x)
|
477 |
+
if g is not None:
|
478 |
+
x = x + self.cond(g)
|
479 |
+
|
480 |
+
for i in range(self.num_upsamples):
|
481 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
482 |
+
x = self.ups[i](x)
|
483 |
+
xs = None
|
484 |
+
for j in range(self.num_kernels):
|
485 |
+
if xs is None:
|
486 |
+
xs = self.resblocks[i * self.num_kernels + j](x)
|
487 |
+
else:
|
488 |
+
xs += self.resblocks[i * self.num_kernels + j](x)
|
489 |
+
x = xs / self.num_kernels
|
490 |
+
x = F.leaky_relu(x)
|
491 |
+
x = self.conv_post(x)
|
492 |
+
x = torch.tanh(x)
|
493 |
+
|
494 |
+
return x
|
495 |
+
|
496 |
+
def remove_weight_norm(self):
|
497 |
+
print("Removing weight norm...")
|
498 |
+
for layer in self.ups:
|
499 |
+
remove_weight_norm(layer)
|
500 |
+
for layer in self.resblocks:
|
501 |
+
layer.remove_weight_norm()
|
502 |
+
|
503 |
+
|
504 |
+
class DiscriminatorP(torch.nn.Module):
|
505 |
+
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
|
506 |
+
super(DiscriminatorP, self).__init__()
|
507 |
+
self.period = period
|
508 |
+
self.use_spectral_norm = use_spectral_norm
|
509 |
+
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
510 |
+
self.convs = nn.ModuleList(
|
511 |
+
[
|
512 |
+
norm_f(
|
513 |
+
Conv2d(
|
514 |
+
1,
|
515 |
+
32,
|
516 |
+
(kernel_size, 1),
|
517 |
+
(stride, 1),
|
518 |
+
padding=(get_padding(kernel_size, 1), 0),
|
519 |
+
)
|
520 |
+
),
|
521 |
+
norm_f(
|
522 |
+
Conv2d(
|
523 |
+
32,
|
524 |
+
128,
|
525 |
+
(kernel_size, 1),
|
526 |
+
(stride, 1),
|
527 |
+
padding=(get_padding(kernel_size, 1), 0),
|
528 |
+
)
|
529 |
+
),
|
530 |
+
norm_f(
|
531 |
+
Conv2d(
|
532 |
+
128,
|
533 |
+
512,
|
534 |
+
(kernel_size, 1),
|
535 |
+
(stride, 1),
|
536 |
+
padding=(get_padding(kernel_size, 1), 0),
|
537 |
+
)
|
538 |
+
),
|
539 |
+
norm_f(
|
540 |
+
Conv2d(
|
541 |
+
512,
|
542 |
+
1024,
|
543 |
+
(kernel_size, 1),
|
544 |
+
(stride, 1),
|
545 |
+
padding=(get_padding(kernel_size, 1), 0),
|
546 |
+
)
|
547 |
+
),
|
548 |
+
norm_f(
|
549 |
+
Conv2d(
|
550 |
+
1024,
|
551 |
+
1024,
|
552 |
+
(kernel_size, 1),
|
553 |
+
1,
|
554 |
+
padding=(get_padding(kernel_size, 1), 0),
|
555 |
+
)
|
556 |
+
),
|
557 |
+
]
|
558 |
+
)
|
559 |
+
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
|
560 |
+
|
561 |
+
def forward(self, x):
|
562 |
+
fmap = []
|
563 |
+
|
564 |
+
# 1d to 2d
|
565 |
+
b, c, t = x.shape
|
566 |
+
if t % self.period != 0: # pad first
|
567 |
+
n_pad = self.period - (t % self.period)
|
568 |
+
x = F.pad(x, (0, n_pad), "reflect")
|
569 |
+
t = t + n_pad
|
570 |
+
x = x.view(b, c, t // self.period, self.period)
|
571 |
+
|
572 |
+
for layer in self.convs:
|
573 |
+
x = layer(x)
|
574 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
575 |
+
fmap.append(x)
|
576 |
+
x = self.conv_post(x)
|
577 |
+
fmap.append(x)
|
578 |
+
x = torch.flatten(x, 1, -1)
|
579 |
+
|
580 |
+
return x, fmap
|
581 |
+
|
582 |
+
|
583 |
+
class DiscriminatorS(torch.nn.Module):
|
584 |
+
def __init__(self, use_spectral_norm=False):
|
585 |
+
super(DiscriminatorS, self).__init__()
|
586 |
+
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
587 |
+
self.convs = nn.ModuleList(
|
588 |
+
[
|
589 |
+
norm_f(Conv1d(1, 16, 15, 1, padding=7)),
|
590 |
+
norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
|
591 |
+
norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
|
592 |
+
norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
|
593 |
+
norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
|
594 |
+
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
|
595 |
+
]
|
596 |
+
)
|
597 |
+
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
|
598 |
+
|
599 |
+
def forward(self, x):
|
600 |
+
fmap = []
|
601 |
+
|
602 |
+
for layer in self.convs:
|
603 |
+
x = layer(x)
|
604 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
605 |
+
fmap.append(x)
|
606 |
+
x = self.conv_post(x)
|
607 |
+
fmap.append(x)
|
608 |
+
x = torch.flatten(x, 1, -1)
|
609 |
+
|
610 |
+
return x, fmap
|
611 |
+
|
612 |
+
|
613 |
+
class MultiPeriodDiscriminator(torch.nn.Module):
|
614 |
+
def __init__(self, use_spectral_norm=False):
|
615 |
+
super(MultiPeriodDiscriminator, self).__init__()
|
616 |
+
periods = [2, 3, 5, 7, 11]
|
617 |
+
|
618 |
+
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
|
619 |
+
discs = discs + [
|
620 |
+
DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
|
621 |
+
]
|
622 |
+
self.discriminators = nn.ModuleList(discs)
|
623 |
+
|
624 |
+
def forward(self, y, y_hat):
|
625 |
+
y_d_rs = []
|
626 |
+
y_d_gs = []
|
627 |
+
fmap_rs = []
|
628 |
+
fmap_gs = []
|
629 |
+
for i, d in enumerate(self.discriminators):
|
630 |
+
y_d_r, fmap_r = d(y)
|
631 |
+
y_d_g, fmap_g = d(y_hat)
|
632 |
+
y_d_rs.append(y_d_r)
|
633 |
+
y_d_gs.append(y_d_g)
|
634 |
+
fmap_rs.append(fmap_r)
|
635 |
+
fmap_gs.append(fmap_g)
|
636 |
+
|
637 |
+
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
638 |
+
|
639 |
+
|
640 |
+
class ReferenceEncoder(nn.Module):
|
641 |
+
"""
|
642 |
+
inputs --- [N, Ty/r, n_mels*r] mels
|
643 |
+
outputs --- [N, ref_enc_gru_size]
|
644 |
+
"""
|
645 |
+
|
646 |
+
def __init__(self, spec_channels, gin_channels=0):
|
647 |
+
super().__init__()
|
648 |
+
self.spec_channels = spec_channels
|
649 |
+
ref_enc_filters = [32, 32, 64, 64, 128, 128]
|
650 |
+
K = len(ref_enc_filters)
|
651 |
+
filters = [1] + ref_enc_filters
|
652 |
+
convs = [
|
653 |
+
weight_norm(
|
654 |
+
nn.Conv2d(
|
655 |
+
in_channels=filters[i],
|
656 |
+
out_channels=filters[i + 1],
|
657 |
+
kernel_size=(3, 3),
|
658 |
+
stride=(2, 2),
|
659 |
+
padding=(1, 1),
|
660 |
+
)
|
661 |
+
)
|
662 |
+
for i in range(K)
|
663 |
+
]
|
664 |
+
self.convs = nn.ModuleList(convs)
|
665 |
+
# self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)]) # noqa: E501
|
666 |
+
|
667 |
+
out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
|
668 |
+
self.gru = nn.GRU(
|
669 |
+
input_size=ref_enc_filters[-1] * out_channels,
|
670 |
+
hidden_size=256 // 2,
|
671 |
+
batch_first=True,
|
672 |
+
)
|
673 |
+
self.proj = nn.Linear(128, gin_channels)
|
674 |
+
|
675 |
+
def forward(self, inputs, mask=None):
|
676 |
+
N = inputs.size(0)
|
677 |
+
out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
|
678 |
+
for conv in self.convs:
|
679 |
+
out = conv(out)
|
680 |
+
# out = wn(out)
|
681 |
+
out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
|
682 |
+
|
683 |
+
out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
|
684 |
+
T = out.size(1)
|
685 |
+
N = out.size(0)
|
686 |
+
out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
|
687 |
+
|
688 |
+
self.gru.flatten_parameters()
|
689 |
+
memory, out = self.gru(out) # out --- [1, N, 128]
|
690 |
+
|
691 |
+
return self.proj(out.squeeze(0))
|
692 |
+
|
693 |
+
def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
|
694 |
+
for i in range(n_convs):
|
695 |
+
L = (L - kernel_size + 2 * pad) // stride + 1
|
696 |
+
return L
|
697 |
+
|
698 |
+
|
699 |
+
class SynthesizerTrn(nn.Module):
|
700 |
+
"""
|
701 |
+
Synthesizer for Training
|
702 |
+
"""
|
703 |
+
|
704 |
+
def __init__(
|
705 |
+
self,
|
706 |
+
n_vocab,
|
707 |
+
spec_channels,
|
708 |
+
segment_size,
|
709 |
+
inter_channels,
|
710 |
+
hidden_channels,
|
711 |
+
filter_channels,
|
712 |
+
n_heads,
|
713 |
+
n_layers,
|
714 |
+
kernel_size,
|
715 |
+
p_dropout,
|
716 |
+
resblock,
|
717 |
+
resblock_kernel_sizes,
|
718 |
+
resblock_dilation_sizes,
|
719 |
+
upsample_rates,
|
720 |
+
upsample_initial_channel,
|
721 |
+
upsample_kernel_sizes,
|
722 |
+
n_speakers=256,
|
723 |
+
gin_channels=256,
|
724 |
+
use_sdp=True,
|
725 |
+
n_flow_layer=4,
|
726 |
+
n_layers_trans_flow=4,
|
727 |
+
flow_share_parameter=False,
|
728 |
+
use_transformer_flow=True,
|
729 |
+
**kwargs,
|
730 |
+
):
|
731 |
+
super().__init__()
|
732 |
+
self.n_vocab = n_vocab
|
733 |
+
self.spec_channels = spec_channels
|
734 |
+
self.inter_channels = inter_channels
|
735 |
+
self.hidden_channels = hidden_channels
|
736 |
+
self.filter_channels = filter_channels
|
737 |
+
self.n_heads = n_heads
|
738 |
+
self.n_layers = n_layers
|
739 |
+
self.kernel_size = kernel_size
|
740 |
+
self.p_dropout = p_dropout
|
741 |
+
self.resblock = resblock
|
742 |
+
self.resblock_kernel_sizes = resblock_kernel_sizes
|
743 |
+
self.resblock_dilation_sizes = resblock_dilation_sizes
|
744 |
+
self.upsample_rates = upsample_rates
|
745 |
+
self.upsample_initial_channel = upsample_initial_channel
|
746 |
+
self.upsample_kernel_sizes = upsample_kernel_sizes
|
747 |
+
self.segment_size = segment_size
|
748 |
+
self.n_speakers = n_speakers
|
749 |
+
self.gin_channels = gin_channels
|
750 |
+
self.n_layers_trans_flow = n_layers_trans_flow
|
751 |
+
self.use_spk_conditioned_encoder = kwargs.get(
|
752 |
+
"use_spk_conditioned_encoder", True
|
753 |
+
)
|
754 |
+
self.use_sdp = use_sdp
|
755 |
+
self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False)
|
756 |
+
self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01)
|
757 |
+
self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6)
|
758 |
+
self.current_mas_noise_scale = self.mas_noise_scale_initial
|
759 |
+
if self.use_spk_conditioned_encoder and gin_channels > 0:
|
760 |
+
self.enc_gin_channels = gin_channels
|
761 |
+
self.enc_p = TextEncoder(
|
762 |
+
n_vocab,
|
763 |
+
inter_channels,
|
764 |
+
hidden_channels,
|
765 |
+
filter_channels,
|
766 |
+
n_heads,
|
767 |
+
n_layers,
|
768 |
+
kernel_size,
|
769 |
+
p_dropout,
|
770 |
+
gin_channels=self.enc_gin_channels,
|
771 |
+
)
|
772 |
+
self.dec = Generator(
|
773 |
+
inter_channels,
|
774 |
+
resblock,
|
775 |
+
resblock_kernel_sizes,
|
776 |
+
resblock_dilation_sizes,
|
777 |
+
upsample_rates,
|
778 |
+
upsample_initial_channel,
|
779 |
+
upsample_kernel_sizes,
|
780 |
+
gin_channels=gin_channels,
|
781 |
+
)
|
782 |
+
self.enc_q = PosteriorEncoder(
|
783 |
+
spec_channels,
|
784 |
+
inter_channels,
|
785 |
+
hidden_channels,
|
786 |
+
5,
|
787 |
+
1,
|
788 |
+
16,
|
789 |
+
gin_channels=gin_channels,
|
790 |
+
)
|
791 |
+
if use_transformer_flow:
|
792 |
+
self.flow = TransformerCouplingBlock(
|
793 |
+
inter_channels,
|
794 |
+
hidden_channels,
|
795 |
+
filter_channels,
|
796 |
+
n_heads,
|
797 |
+
n_layers_trans_flow,
|
798 |
+
5,
|
799 |
+
p_dropout,
|
800 |
+
n_flow_layer,
|
801 |
+
gin_channels=gin_channels,
|
802 |
+
share_parameter=flow_share_parameter,
|
803 |
+
)
|
804 |
+
else:
|
805 |
+
self.flow = ResidualCouplingBlock(
|
806 |
+
inter_channels,
|
807 |
+
hidden_channels,
|
808 |
+
5,
|
809 |
+
1,
|
810 |
+
n_flow_layer,
|
811 |
+
gin_channels=gin_channels,
|
812 |
+
)
|
813 |
+
self.sdp = StochasticDurationPredictor(
|
814 |
+
hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
|
815 |
+
)
|
816 |
+
self.dp = DurationPredictor(
|
817 |
+
hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
|
818 |
+
)
|
819 |
+
|
820 |
+
if n_speakers >= 1:
|
821 |
+
self.emb_g = nn.Embedding(n_speakers, gin_channels)
|
822 |
+
else:
|
823 |
+
self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)
|
824 |
+
|
825 |
+
def export_onnx(
|
826 |
+
self,
|
827 |
+
path,
|
828 |
+
max_len=None,
|
829 |
+
sdp_ratio=0,
|
830 |
+
y=None,
|
831 |
+
):
|
832 |
+
noise_scale = 0.667
|
833 |
+
length_scale = 1
|
834 |
+
noise_scale_w = 0.8
|
835 |
+
x = (
|
836 |
+
torch.LongTensor(
|
837 |
+
[
|
838 |
+
0,
|
839 |
+
97,
|
840 |
+
0,
|
841 |
+
8,
|
842 |
+
0,
|
843 |
+
78,
|
844 |
+
0,
|
845 |
+
8,
|
846 |
+
0,
|
847 |
+
76,
|
848 |
+
0,
|
849 |
+
37,
|
850 |
+
0,
|
851 |
+
40,
|
852 |
+
0,
|
853 |
+
97,
|
854 |
+
0,
|
855 |
+
8,
|
856 |
+
0,
|
857 |
+
23,
|
858 |
+
0,
|
859 |
+
8,
|
860 |
+
0,
|
861 |
+
74,
|
862 |
+
0,
|
863 |
+
26,
|
864 |
+
0,
|
865 |
+
104,
|
866 |
+
0,
|
867 |
+
]
|
868 |
+
)
|
869 |
+
.unsqueeze(0)
|
870 |
+
.cpu()
|
871 |
+
)
|
872 |
+
tone = torch.zeros_like(x).cpu()
|
873 |
+
language = torch.zeros_like(x).cpu()
|
874 |
+
x_lengths = torch.LongTensor([x.shape[1]]).cpu()
|
875 |
+
sid = torch.LongTensor([0]).cpu()
|
876 |
+
bert = torch.randn(size=(x.shape[1], 1024)).cpu()
|
877 |
+
ja_bert = torch.randn(size=(x.shape[1], 1024)).cpu()
|
878 |
+
en_bert = torch.randn(size=(x.shape[1], 1024)).cpu()
|
879 |
+
|
880 |
+
if self.n_speakers > 0:
|
881 |
+
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
|
882 |
+
torch.onnx.export(
|
883 |
+
self.emb_g,
|
884 |
+
(sid),
|
885 |
+
f"onnx/{path}/{path}_emb.onnx",
|
886 |
+
input_names=["sid"],
|
887 |
+
output_names=["g"],
|
888 |
+
verbose=True,
|
889 |
+
)
|
890 |
+
else:
|
891 |
+
g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
|
892 |
+
|
893 |
+
torch.onnx.export(
|
894 |
+
self.enc_p,
|
895 |
+
(x, x_lengths, tone, language, bert, ja_bert, en_bert, g),
|
896 |
+
f"onnx/{path}/{path}_enc_p.onnx",
|
897 |
+
input_names=[
|
898 |
+
"x",
|
899 |
+
"x_lengths",
|
900 |
+
"t",
|
901 |
+
"language",
|
902 |
+
"bert_0",
|
903 |
+
"bert_1",
|
904 |
+
"bert_2",
|
905 |
+
"g",
|
906 |
+
],
|
907 |
+
output_names=["xout", "m_p", "logs_p", "x_mask"],
|
908 |
+
dynamic_axes={
|
909 |
+
"x": [0, 1],
|
910 |
+
"t": [0, 1],
|
911 |
+
"language": [0, 1],
|
912 |
+
"bert_0": [0],
|
913 |
+
"bert_1": [0],
|
914 |
+
"bert_2": [0],
|
915 |
+
"xout": [0, 2],
|
916 |
+
"m_p": [0, 2],
|
917 |
+
"logs_p": [0, 2],
|
918 |
+
"x_mask": [0, 2],
|
919 |
+
},
|
920 |
+
verbose=True,
|
921 |
+
opset_version=16,
|
922 |
+
)
|
923 |
+
x, m_p, logs_p, x_mask = self.enc_p(
|
924 |
+
x, x_lengths, tone, language, bert, ja_bert, en_bert, g=g
|
925 |
+
)
|
926 |
+
zinput = (
|
927 |
+
torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
|
928 |
+
* noise_scale_w
|
929 |
+
)
|
930 |
+
torch.onnx.export(
|
931 |
+
self.sdp,
|
932 |
+
(x, x_mask, zinput, g),
|
933 |
+
f"onnx/{path}/{path}_sdp.onnx",
|
934 |
+
input_names=["x", "x_mask", "zin", "g"],
|
935 |
+
output_names=["logw"],
|
936 |
+
dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "zin": [0, 2], "logw": [0, 2]},
|
937 |
+
verbose=True,
|
938 |
+
)
|
939 |
+
torch.onnx.export(
|
940 |
+
self.dp,
|
941 |
+
(x, x_mask, g),
|
942 |
+
f"onnx/{path}/{path}_dp.onnx",
|
943 |
+
input_names=["x", "x_mask", "g"],
|
944 |
+
output_names=["logw"],
|
945 |
+
dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "logw": [0, 2]},
|
946 |
+
verbose=True,
|
947 |
+
)
|
948 |
+
logw = self.sdp(x, x_mask, zinput, g=g) * (sdp_ratio) + self.dp(
|
949 |
+
x, x_mask, g=g
|
950 |
+
) * (1 - sdp_ratio)
|
951 |
+
w = torch.exp(logw) * x_mask * length_scale
|
952 |
+
w_ceil = torch.ceil(w)
|
953 |
+
y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
|
954 |
+
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(
|
955 |
+
x_mask.dtype
|
956 |
+
)
|
957 |
+
attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
|
958 |
+
attn = commons.generate_path(w_ceil, attn_mask)
|
959 |
+
|
960 |
+
m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(
|
961 |
+
1, 2
|
962 |
+
) # [b, t', t], [b, t, d] -> [b, d, t']
|
963 |
+
logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
|
964 |
+
1, 2
|
965 |
+
) # [b, t', t], [b, t, d] -> [b, d, t']
|
966 |
+
|
967 |
+
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
|
968 |
+
torch.onnx.export(
|
969 |
+
self.flow,
|
970 |
+
(z_p, y_mask, g),
|
971 |
+
f"onnx/{path}/{path}_flow.onnx",
|
972 |
+
input_names=["z_p", "y_mask", "g"],
|
973 |
+
output_names=["z"],
|
974 |
+
dynamic_axes={"z_p": [0, 2], "y_mask": [0, 2], "z": [0, 2]},
|
975 |
+
verbose=True,
|
976 |
+
)
|
977 |
+
|
978 |
+
z = self.flow(z_p, y_mask, g=g, reverse=True)
|
979 |
+
z_in = (z * y_mask)[:, :, :max_len]
|
980 |
+
|
981 |
+
torch.onnx.export(
|
982 |
+
self.dec,
|
983 |
+
(z_in, g),
|
984 |
+
f"onnx/{path}/{path}_dec.onnx",
|
985 |
+
input_names=["z_in", "g"],
|
986 |
+
output_names=["o"],
|
987 |
+
dynamic_axes={"z_in": [0, 2], "o": [0, 2]},
|
988 |
+
verbose=True,
|
989 |
+
)
|
990 |
+
o = self.dec((z * y_mask)[:, :, :max_len], g=g)
|
onnx_modules/V200/text/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .symbols import *
|
onnx_modules/V200/text/bert_utils.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
|
3 |
+
from huggingface_hub import hf_hub_download
|
4 |
+
|
5 |
+
from config import config
|
6 |
+
|
7 |
+
|
8 |
+
MIRROR: str = config.mirror
|
9 |
+
|
10 |
+
|
11 |
+
def _check_bert(repo_id, files, local_path):
|
12 |
+
for file in files:
|
13 |
+
if not Path(local_path).joinpath(file).exists():
|
14 |
+
if MIRROR.lower() == "openi":
|
15 |
+
import openi
|
16 |
+
|
17 |
+
openi.model.download_model(
|
18 |
+
"Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert"
|
19 |
+
)
|
20 |
+
else:
|
21 |
+
hf_hub_download(
|
22 |
+
repo_id, file, local_dir=local_path, local_dir_use_symlinks=False
|
23 |
+
)
|
onnx_modules/V200/text/chinese.py
ADDED
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
|
4 |
+
import cn2an
|
5 |
+
from pypinyin import lazy_pinyin, Style
|
6 |
+
|
7 |
+
from .symbols import punctuation
|
8 |
+
from .tone_sandhi import ToneSandhi
|
9 |
+
|
10 |
+
current_file_path = os.path.dirname(__file__)
|
11 |
+
pinyin_to_symbol_map = {
|
12 |
+
line.split("\t")[0]: line.strip().split("\t")[1]
|
13 |
+
for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
|
14 |
+
}
|
15 |
+
|
16 |
+
import jieba.posseg as psg
|
17 |
+
|
18 |
+
|
19 |
+
rep_map = {
|
20 |
+
":": ",",
|
21 |
+
";": ",",
|
22 |
+
",": ",",
|
23 |
+
"。": ".",
|
24 |
+
"!": "!",
|
25 |
+
"?": "?",
|
26 |
+
"\n": ".",
|
27 |
+
"·": ",",
|
28 |
+
"、": ",",
|
29 |
+
"...": "…",
|
30 |
+
"$": ".",
|
31 |
+
"“": "'",
|
32 |
+
"”": "'",
|
33 |
+
"‘": "'",
|
34 |
+
"’": "'",
|
35 |
+
"(": "'",
|
36 |
+
")": "'",
|
37 |
+
"(": "'",
|
38 |
+
")": "'",
|
39 |
+
"《": "'",
|
40 |
+
"》": "'",
|
41 |
+
"【": "'",
|
42 |
+
"】": "'",
|
43 |
+
"[": "'",
|
44 |
+
"]": "'",
|
45 |
+
"—": "-",
|
46 |
+
"~": "-",
|
47 |
+
"~": "-",
|
48 |
+
"「": "'",
|
49 |
+
"」": "'",
|
50 |
+
}
|
51 |
+
|
52 |
+
tone_modifier = ToneSandhi()
|
53 |
+
|
54 |
+
|
55 |
+
def replace_punctuation(text):
|
56 |
+
text = text.replace("嗯", "恩").replace("呣", "母")
|
57 |
+
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
|
58 |
+
|
59 |
+
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
|
60 |
+
|
61 |
+
replaced_text = re.sub(
|
62 |
+
r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
|
63 |
+
)
|
64 |
+
|
65 |
+
return replaced_text
|
66 |
+
|
67 |
+
|
68 |
+
def g2p(text):
|
69 |
+
pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
|
70 |
+
sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
|
71 |
+
phones, tones, word2ph = _g2p(sentences)
|
72 |
+
assert sum(word2ph) == len(phones)
|
73 |
+
assert len(word2ph) == len(text) # Sometimes it will crash,you can add a try-catch.
|
74 |
+
phones = ["_"] + phones + ["_"]
|
75 |
+
tones = [0] + tones + [0]
|
76 |
+
word2ph = [1] + word2ph + [1]
|
77 |
+
return phones, tones, word2ph
|
78 |
+
|
79 |
+
|
80 |
+
def _get_initials_finals(word):
|
81 |
+
initials = []
|
82 |
+
finals = []
|
83 |
+
orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
|
84 |
+
orig_finals = lazy_pinyin(
|
85 |
+
word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
|
86 |
+
)
|
87 |
+
for c, v in zip(orig_initials, orig_finals):
|
88 |
+
initials.append(c)
|
89 |
+
finals.append(v)
|
90 |
+
return initials, finals
|
91 |
+
|
92 |
+
|
93 |
+
def _g2p(segments):
|
94 |
+
phones_list = []
|
95 |
+
tones_list = []
|
96 |
+
word2ph = []
|
97 |
+
for seg in segments:
|
98 |
+
# Replace all English words in the sentence
|
99 |
+
seg = re.sub("[a-zA-Z]+", "", seg)
|
100 |
+
seg_cut = psg.lcut(seg)
|
101 |
+
initials = []
|
102 |
+
finals = []
|
103 |
+
seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
|
104 |
+
for word, pos in seg_cut:
|
105 |
+
if pos == "eng":
|
106 |
+
continue
|
107 |
+
sub_initials, sub_finals = _get_initials_finals(word)
|
108 |
+
sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
|
109 |
+
initials.append(sub_initials)
|
110 |
+
finals.append(sub_finals)
|
111 |
+
|
112 |
+
# assert len(sub_initials) == len(sub_finals) == len(word)
|
113 |
+
initials = sum(initials, [])
|
114 |
+
finals = sum(finals, [])
|
115 |
+
#
|
116 |
+
for c, v in zip(initials, finals):
|
117 |
+
raw_pinyin = c + v
|
118 |
+
# NOTE: post process for pypinyin outputs
|
119 |
+
# we discriminate i, ii and iii
|
120 |
+
if c == v:
|
121 |
+
assert c in punctuation
|
122 |
+
phone = [c]
|
123 |
+
tone = "0"
|
124 |
+
word2ph.append(1)
|
125 |
+
else:
|
126 |
+
v_without_tone = v[:-1]
|
127 |
+
tone = v[-1]
|
128 |
+
|
129 |
+
pinyin = c + v_without_tone
|
130 |
+
assert tone in "12345"
|
131 |
+
|
132 |
+
if c:
|
133 |
+
# 多音节
|
134 |
+
v_rep_map = {
|
135 |
+
"uei": "ui",
|
136 |
+
"iou": "iu",
|
137 |
+
"uen": "un",
|
138 |
+
}
|
139 |
+
if v_without_tone in v_rep_map.keys():
|
140 |
+
pinyin = c + v_rep_map[v_without_tone]
|
141 |
+
else:
|
142 |
+
# 单音节
|
143 |
+
pinyin_rep_map = {
|
144 |
+
"ing": "ying",
|
145 |
+
"i": "yi",
|
146 |
+
"in": "yin",
|
147 |
+
"u": "wu",
|
148 |
+
}
|
149 |
+
if pinyin in pinyin_rep_map.keys():
|
150 |
+
pinyin = pinyin_rep_map[pinyin]
|
151 |
+
else:
|
152 |
+
single_rep_map = {
|
153 |
+
"v": "yu",
|
154 |
+
"e": "e",
|
155 |
+
"i": "y",
|
156 |
+
"u": "w",
|
157 |
+
}
|
158 |
+
if pinyin[0] in single_rep_map.keys():
|
159 |
+
pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
|
160 |
+
|
161 |
+
assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
|
162 |
+
phone = pinyin_to_symbol_map[pinyin].split(" ")
|
163 |
+
word2ph.append(len(phone))
|
164 |
+
|
165 |
+
phones_list += phone
|
166 |
+
tones_list += [int(tone)] * len(phone)
|
167 |
+
return phones_list, tones_list, word2ph
|
168 |
+
|
169 |
+
|
170 |
+
def text_normalize(text):
|
171 |
+
numbers = re.findall(r"\d+(?:\.?\d+)?", text)
|
172 |
+
for number in numbers:
|
173 |
+
text = text.replace(number, cn2an.an2cn(number), 1)
|
174 |
+
text = replace_punctuation(text)
|
175 |
+
return text
|
176 |
+
|
177 |
+
|
178 |
+
def get_bert_feature(text, word2ph):
|
179 |
+
from text import chinese_bert
|
180 |
+
|
181 |
+
return chinese_bert.get_bert_feature(text, word2ph)
|
182 |
+
|
183 |
+
|
184 |
+
if __name__ == "__main__":
|
185 |
+
from text.chinese_bert import get_bert_feature
|
186 |
+
|
187 |
+
text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
|
188 |
+
text = text_normalize(text)
|
189 |
+
print(text)
|
190 |
+
phones, tones, word2ph = g2p(text)
|
191 |
+
bert = get_bert_feature(text, word2ph)
|
192 |
+
|
193 |
+
print(phones, tones, word2ph, bert.shape)
|
194 |
+
|
195 |
+
|
196 |
+
# # 示例用法
|
197 |
+
# text = "这是一个示例文本:,你好!这是一个测试...."
|
198 |
+
# print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试
|
onnx_modules/V200/text/chinese_bert.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
5 |
+
|
6 |
+
from config import config
|
7 |
+
|
8 |
+
LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
|
9 |
+
|
10 |
+
tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
|
11 |
+
|
12 |
+
models = dict()
|
13 |
+
|
14 |
+
|
15 |
+
def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
|
16 |
+
if (
|
17 |
+
sys.platform == "darwin"
|
18 |
+
and torch.backends.mps.is_available()
|
19 |
+
and device == "cpu"
|
20 |
+
):
|
21 |
+
device = "mps"
|
22 |
+
if not device:
|
23 |
+
device = "cuda"
|
24 |
+
if device not in models.keys():
|
25 |
+
models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
|
26 |
+
with torch.no_grad():
|
27 |
+
inputs = tokenizer(text, return_tensors="pt")
|
28 |
+
for i in inputs:
|
29 |
+
inputs[i] = inputs[i].to(device)
|
30 |
+
res = models[device](**inputs, output_hidden_states=True)
|
31 |
+
res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
|
32 |
+
|
33 |
+
assert len(word2ph) == len(text) + 2
|
34 |
+
word2phone = word2ph
|
35 |
+
phone_level_feature = []
|
36 |
+
for i in range(len(word2phone)):
|
37 |
+
repeat_feature = res[i].repeat(word2phone[i], 1)
|
38 |
+
phone_level_feature.append(repeat_feature)
|
39 |
+
|
40 |
+
phone_level_feature = torch.cat(phone_level_feature, dim=0)
|
41 |
+
|
42 |
+
return phone_level_feature.T
|
43 |
+
|
44 |
+
|
45 |
+
if __name__ == "__main__":
|
46 |
+
word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征
|
47 |
+
word2phone = [
|
48 |
+
1,
|
49 |
+
2,
|
50 |
+
1,
|
51 |
+
2,
|
52 |
+
2,
|
53 |
+
1,
|
54 |
+
2,
|
55 |
+
2,
|
56 |
+
1,
|
57 |
+
2,
|
58 |
+
2,
|
59 |
+
1,
|
60 |
+
2,
|
61 |
+
2,
|
62 |
+
2,
|
63 |
+
2,
|
64 |
+
2,
|
65 |
+
1,
|
66 |
+
1,
|
67 |
+
2,
|
68 |
+
2,
|
69 |
+
1,
|
70 |
+
2,
|
71 |
+
2,
|
72 |
+
2,
|
73 |
+
2,
|
74 |
+
1,
|
75 |
+
2,
|
76 |
+
2,
|
77 |
+
2,
|
78 |
+
2,
|
79 |
+
2,
|
80 |
+
1,
|
81 |
+
2,
|
82 |
+
2,
|
83 |
+
2,
|
84 |
+
2,
|
85 |
+
1,
|
86 |
+
]
|
87 |
+
|
88 |
+
# 计算总帧数
|
89 |
+
total_frames = sum(word2phone)
|
90 |
+
print(word_level_feature.shape)
|
91 |
+
print(word2phone)
|
92 |
+
phone_level_feature = []
|
93 |
+
for i in range(len(word2phone)):
|
94 |
+
print(word_level_feature[i].shape)
|
95 |
+
|
96 |
+
# 对每个词重复word2phone[i]次
|
97 |
+
repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
|
98 |
+
phone_level_feature.append(repeat_feature)
|
99 |
+
|
100 |
+
phone_level_feature = torch.cat(phone_level_feature, dim=0)
|
101 |
+
print(phone_level_feature.shape) # torch.Size([36, 1024])
|
onnx_modules/V200/text/cleaner.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from . import chinese, japanese, english, cleaned_text_to_sequence
|
2 |
+
|
3 |
+
|
4 |
+
language_module_map = {"ZH": chinese, "JP": japanese, "EN": english}
|
5 |
+
|
6 |
+
|
7 |
+
def clean_text(text, language):
|
8 |
+
language_module = language_module_map[language]
|
9 |
+
norm_text = language_module.text_normalize(text)
|
10 |
+
phones, tones, word2ph = language_module.g2p(norm_text)
|
11 |
+
return norm_text, phones, tones, word2ph
|
12 |
+
|
13 |
+
|
14 |
+
def clean_text_bert(text, language):
|
15 |
+
language_module = language_module_map[language]
|
16 |
+
norm_text = language_module.text_normalize(text)
|
17 |
+
phones, tones, word2ph = language_module.g2p(norm_text)
|
18 |
+
bert = language_module.get_bert_feature(norm_text, word2ph)
|
19 |
+
return phones, tones, bert
|
20 |
+
|
21 |
+
|
22 |
+
def text_to_sequence(text, language):
|
23 |
+
norm_text, phones, tones, word2ph = clean_text(text, language)
|
24 |
+
return cleaned_text_to_sequence(phones, tones, language)
|
25 |
+
|
26 |
+
|
27 |
+
if __name__ == "__main__":
|
28 |
+
pass
|
onnx_modules/V200/text/english.py
ADDED
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pickle
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
from g2p_en import G2p
|
5 |
+
|
6 |
+
from . import symbols
|
7 |
+
|
8 |
+
current_file_path = os.path.dirname(__file__)
|
9 |
+
CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
|
10 |
+
CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
|
11 |
+
_g2p = G2p()
|
12 |
+
|
13 |
+
arpa = {
|
14 |
+
"AH0",
|
15 |
+
"S",
|
16 |
+
"AH1",
|
17 |
+
"EY2",
|
18 |
+
"AE2",
|
19 |
+
"EH0",
|
20 |
+
"OW2",
|
21 |
+
"UH0",
|
22 |
+
"NG",
|
23 |
+
"B",
|
24 |
+
"G",
|
25 |
+
"AY0",
|
26 |
+
"M",
|
27 |
+
"AA0",
|
28 |
+
"F",
|
29 |
+
"AO0",
|
30 |
+
"ER2",
|
31 |
+
"UH1",
|
32 |
+
"IY1",
|
33 |
+
"AH2",
|
34 |
+
"DH",
|
35 |
+
"IY0",
|
36 |
+
"EY1",
|
37 |
+
"IH0",
|
38 |
+
"K",
|
39 |
+
"N",
|
40 |
+
"W",
|
41 |
+
"IY2",
|
42 |
+
"T",
|
43 |
+
"AA1",
|
44 |
+
"ER1",
|
45 |
+
"EH2",
|
46 |
+
"OY0",
|
47 |
+
"UH2",
|
48 |
+
"UW1",
|
49 |
+
"Z",
|
50 |
+
"AW2",
|
51 |
+
"AW1",
|
52 |
+
"V",
|
53 |
+
"UW2",
|
54 |
+
"AA2",
|
55 |
+
"ER",
|
56 |
+
"AW0",
|
57 |
+
"UW0",
|
58 |
+
"R",
|
59 |
+
"OW1",
|
60 |
+
"EH1",
|
61 |
+
"ZH",
|
62 |
+
"AE0",
|
63 |
+
"IH2",
|
64 |
+
"IH",
|
65 |
+
"Y",
|
66 |
+
"JH",
|
67 |
+
"P",
|
68 |
+
"AY1",
|
69 |
+
"EY0",
|
70 |
+
"OY2",
|
71 |
+
"TH",
|
72 |
+
"HH",
|
73 |
+
"D",
|
74 |
+
"ER0",
|
75 |
+
"CH",
|
76 |
+
"AO1",
|
77 |
+
"AE1",
|
78 |
+
"AO2",
|
79 |
+
"OY1",
|
80 |
+
"AY2",
|
81 |
+
"IH1",
|
82 |
+
"OW0",
|
83 |
+
"L",
|
84 |
+
"SH",
|
85 |
+
}
|
86 |
+
|
87 |
+
|
88 |
+
def post_replace_ph(ph):
|
89 |
+
rep_map = {
|
90 |
+
":": ",",
|
91 |
+
";": ",",
|
92 |
+
",": ",",
|
93 |
+
"。": ".",
|
94 |
+
"!": "!",
|
95 |
+
"?": "?",
|
96 |
+
"\n": ".",
|
97 |
+
"·": ",",
|
98 |
+
"、": ",",
|
99 |
+
"...": "…",
|
100 |
+
"v": "V",
|
101 |
+
}
|
102 |
+
if ph in rep_map.keys():
|
103 |
+
ph = rep_map[ph]
|
104 |
+
if ph in symbols:
|
105 |
+
return ph
|
106 |
+
if ph not in symbols:
|
107 |
+
ph = "UNK"
|
108 |
+
return ph
|
109 |
+
|
110 |
+
|
111 |
+
def read_dict():
|
112 |
+
g2p_dict = {}
|
113 |
+
start_line = 49
|
114 |
+
with open(CMU_DICT_PATH) as f:
|
115 |
+
line = f.readline()
|
116 |
+
line_index = 1
|
117 |
+
while line:
|
118 |
+
if line_index >= start_line:
|
119 |
+
line = line.strip()
|
120 |
+
word_split = line.split(" ")
|
121 |
+
word = word_split[0]
|
122 |
+
|
123 |
+
syllable_split = word_split[1].split(" - ")
|
124 |
+
g2p_dict[word] = []
|
125 |
+
for syllable in syllable_split:
|
126 |
+
phone_split = syllable.split(" ")
|
127 |
+
g2p_dict[word].append(phone_split)
|
128 |
+
|
129 |
+
line_index = line_index + 1
|
130 |
+
line = f.readline()
|
131 |
+
|
132 |
+
return g2p_dict
|
133 |
+
|
134 |
+
|
135 |
+
def cache_dict(g2p_dict, file_path):
|
136 |
+
with open(file_path, "wb") as pickle_file:
|
137 |
+
pickle.dump(g2p_dict, pickle_file)
|
138 |
+
|
139 |
+
|
140 |
+
def get_dict():
|
141 |
+
if os.path.exists(CACHE_PATH):
|
142 |
+
with open(CACHE_PATH, "rb") as pickle_file:
|
143 |
+
g2p_dict = pickle.load(pickle_file)
|
144 |
+
else:
|
145 |
+
g2p_dict = read_dict()
|
146 |
+
cache_dict(g2p_dict, CACHE_PATH)
|
147 |
+
|
148 |
+
return g2p_dict
|
149 |
+
|
150 |
+
|
151 |
+
eng_dict = get_dict()
|
152 |
+
|
153 |
+
|
154 |
+
def refine_ph(phn):
|
155 |
+
tone = 0
|
156 |
+
if re.search(r"\d$", phn):
|
157 |
+
tone = int(phn[-1]) + 1
|
158 |
+
phn = phn[:-1]
|
159 |
+
return phn.lower(), tone
|
160 |
+
|
161 |
+
|
162 |
+
def refine_syllables(syllables):
|
163 |
+
tones = []
|
164 |
+
phonemes = []
|
165 |
+
for phn_list in syllables:
|
166 |
+
for i in range(len(phn_list)):
|
167 |
+
phn = phn_list[i]
|
168 |
+
phn, tone = refine_ph(phn)
|
169 |
+
phonemes.append(phn)
|
170 |
+
tones.append(tone)
|
171 |
+
return phonemes, tones
|
172 |
+
|
173 |
+
|
174 |
+
import re
|
175 |
+
import inflect
|
176 |
+
|
177 |
+
_inflect = inflect.engine()
|
178 |
+
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
|
179 |
+
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
|
180 |
+
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
|
181 |
+
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
|
182 |
+
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
|
183 |
+
_number_re = re.compile(r"[0-9]+")
|
184 |
+
|
185 |
+
# List of (regular expression, replacement) pairs for abbreviations:
|
186 |
+
_abbreviations = [
|
187 |
+
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
|
188 |
+
for x in [
|
189 |
+
("mrs", "misess"),
|
190 |
+
("mr", "mister"),
|
191 |
+
("dr", "doctor"),
|
192 |
+
("st", "saint"),
|
193 |
+
("co", "company"),
|
194 |
+
("jr", "junior"),
|
195 |
+
("maj", "major"),
|
196 |
+
("gen", "general"),
|
197 |
+
("drs", "doctors"),
|
198 |
+
("rev", "reverend"),
|
199 |
+
("lt", "lieutenant"),
|
200 |
+
("hon", "honorable"),
|
201 |
+
("sgt", "sergeant"),
|
202 |
+
("capt", "captain"),
|
203 |
+
("esq", "esquire"),
|
204 |
+
("ltd", "limited"),
|
205 |
+
("col", "colonel"),
|
206 |
+
("ft", "fort"),
|
207 |
+
]
|
208 |
+
]
|
209 |
+
|
210 |
+
|
211 |
+
# List of (ipa, lazy ipa) pairs:
|
212 |
+
_lazy_ipa = [
|
213 |
+
(re.compile("%s" % x[0]), x[1])
|
214 |
+
for x in [
|
215 |
+
("r", "ɹ"),
|
216 |
+
("æ", "e"),
|
217 |
+
("ɑ", "a"),
|
218 |
+
("ɔ", "o"),
|
219 |
+
("ð", "z"),
|
220 |
+
("θ", "s"),
|
221 |
+
("ɛ", "e"),
|
222 |
+
("ɪ", "i"),
|
223 |
+
("ʊ", "u"),
|
224 |
+
("ʒ", "ʥ"),
|
225 |
+
("ʤ", "ʥ"),
|
226 |
+
("ˈ", "↓"),
|
227 |
+
]
|
228 |
+
]
|
229 |
+
|
230 |
+
# List of (ipa, lazy ipa2) pairs:
|
231 |
+
_lazy_ipa2 = [
|
232 |
+
(re.compile("%s" % x[0]), x[1])
|
233 |
+
for x in [
|
234 |
+
("r", "ɹ"),
|
235 |
+
("ð", "z"),
|
236 |
+
("θ", "s"),
|
237 |
+
("ʒ", "ʑ"),
|
238 |
+
("ʤ", "dʑ"),
|
239 |
+
("ˈ", "↓"),
|
240 |
+
]
|
241 |
+
]
|
242 |
+
|
243 |
+
# List of (ipa, ipa2) pairs
|
244 |
+
_ipa_to_ipa2 = [
|
245 |
+
(re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
|
246 |
+
]
|
247 |
+
|
248 |
+
|
249 |
+
def _expand_dollars(m):
|
250 |
+
match = m.group(1)
|
251 |
+
parts = match.split(".")
|
252 |
+
if len(parts) > 2:
|
253 |
+
return match + " dollars" # Unexpected format
|
254 |
+
dollars = int(parts[0]) if parts[0] else 0
|
255 |
+
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
|
256 |
+
if dollars and cents:
|
257 |
+
dollar_unit = "dollar" if dollars == 1 else "dollars"
|
258 |
+
cent_unit = "cent" if cents == 1 else "cents"
|
259 |
+
return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
|
260 |
+
elif dollars:
|
261 |
+
dollar_unit = "dollar" if dollars == 1 else "dollars"
|
262 |
+
return "%s %s" % (dollars, dollar_unit)
|
263 |
+
elif cents:
|
264 |
+
cent_unit = "cent" if cents == 1 else "cents"
|
265 |
+
return "%s %s" % (cents, cent_unit)
|
266 |
+
else:
|
267 |
+
return "zero dollars"
|
268 |
+
|
269 |
+
|
270 |
+
def _remove_commas(m):
|
271 |
+
return m.group(1).replace(",", "")
|
272 |
+
|
273 |
+
|
274 |
+
def _expand_ordinal(m):
|
275 |
+
return _inflect.number_to_words(m.group(0))
|
276 |
+
|
277 |
+
|
278 |
+
def _expand_number(m):
|
279 |
+
num = int(m.group(0))
|
280 |
+
if num > 1000 and num < 3000:
|
281 |
+
if num == 2000:
|
282 |
+
return "two thousand"
|
283 |
+
elif num > 2000 and num < 2010:
|
284 |
+
return "two thousand " + _inflect.number_to_words(num % 100)
|
285 |
+
elif num % 100 == 0:
|
286 |
+
return _inflect.number_to_words(num // 100) + " hundred"
|
287 |
+
else:
|
288 |
+
return _inflect.number_to_words(
|
289 |
+
num, andword="", zero="oh", group=2
|
290 |
+
).replace(", ", " ")
|
291 |
+
else:
|
292 |
+
return _inflect.number_to_words(num, andword="")
|
293 |
+
|
294 |
+
|
295 |
+
def _expand_decimal_point(m):
|
296 |
+
return m.group(1).replace(".", " point ")
|
297 |
+
|
298 |
+
|
299 |
+
def normalize_numbers(text):
|
300 |
+
text = re.sub(_comma_number_re, _remove_commas, text)
|
301 |
+
text = re.sub(_pounds_re, r"\1 pounds", text)
|
302 |
+
text = re.sub(_dollars_re, _expand_dollars, text)
|
303 |
+
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
|
304 |
+
text = re.sub(_ordinal_re, _expand_ordinal, text)
|
305 |
+
text = re.sub(_number_re, _expand_number, text)
|
306 |
+
return text
|
307 |
+
|
308 |
+
|
309 |
+
def text_normalize(text):
|
310 |
+
text = normalize_numbers(text)
|
311 |
+
return text
|
312 |
+
|
313 |
+
|
314 |
+
def g2p(text):
|
315 |
+
phones = []
|
316 |
+
tones = []
|
317 |
+
word2ph = []
|
318 |
+
words = re.split(r"([,;.\-\?\!\s+])", text)
|
319 |
+
words = [word for word in words if word.strip() != ""]
|
320 |
+
for word in words:
|
321 |
+
if word.upper() in eng_dict:
|
322 |
+
phns, tns = refine_syllables(eng_dict[word.upper()])
|
323 |
+
phones += phns
|
324 |
+
tones += tns
|
325 |
+
word2ph.append(len(phns))
|
326 |
+
else:
|
327 |
+
phone_list = list(filter(lambda p: p != " ", _g2p(word)))
|
328 |
+
for ph in phone_list:
|
329 |
+
if ph in arpa:
|
330 |
+
ph, tn = refine_ph(ph)
|
331 |
+
phones.append(ph)
|
332 |
+
tones.append(tn)
|
333 |
+
else:
|
334 |
+
phones.append(ph)
|
335 |
+
tones.append(0)
|
336 |
+
word2ph.append(len(phone_list))
|
337 |
+
|
338 |
+
phones = [post_replace_ph(i) for i in phones]
|
339 |
+
|
340 |
+
phones = ["_"] + phones + ["_"]
|
341 |
+
tones = [0] + tones + [0]
|
342 |
+
word2ph = [1] + word2ph + [1]
|
343 |
+
|
344 |
+
return phones, tones, word2ph
|
345 |
+
|
346 |
+
|
347 |
+
def get_bert_feature(text, word2ph):
|
348 |
+
from text import english_bert_mock
|
349 |
+
|
350 |
+
return english_bert_mock.get_bert_feature(text, word2ph)
|
351 |
+
|
352 |
+
|
353 |
+
if __name__ == "__main__":
|
354 |
+
# print(get_dict())
|
355 |
+
# print(eng_word_to_phoneme("hello"))
|
356 |
+
print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
|
357 |
+
# all_phones = set()
|
358 |
+
# for k, syllables in eng_dict.items():
|
359 |
+
# for group in syllables:
|
360 |
+
# for ph in group:
|
361 |
+
# all_phones.add(ph)
|
362 |
+
# print(all_phones)
|
onnx_modules/V200/text/english_bert_mock.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from transformers import DebertaV2Model, DebertaV2Tokenizer
|
5 |
+
|
6 |
+
from config import config
|
7 |
+
|
8 |
+
|
9 |
+
LOCAL_PATH = "./bert/deberta-v3-large"
|
10 |
+
|
11 |
+
tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
|
12 |
+
|
13 |
+
models = dict()
|
14 |
+
|
15 |
+
|
16 |
+
def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
|
17 |
+
if (
|
18 |
+
sys.platform == "darwin"
|
19 |
+
and torch.backends.mps.is_available()
|
20 |
+
and device == "cpu"
|
21 |
+
):
|
22 |
+
device = "mps"
|
23 |
+
if not device:
|
24 |
+
device = "cuda"
|
25 |
+
if device not in models.keys():
|
26 |
+
models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
|
27 |
+
with torch.no_grad():
|
28 |
+
inputs = tokenizer(text, return_tensors="pt")
|
29 |
+
for i in inputs:
|
30 |
+
inputs[i] = inputs[i].to(device)
|
31 |
+
res = models[device](**inputs, output_hidden_states=True)
|
32 |
+
res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
|
33 |
+
# assert len(word2ph) == len(text)+2
|
34 |
+
word2phone = word2ph
|
35 |
+
phone_level_feature = []
|
36 |
+
for i in range(len(word2phone)):
|
37 |
+
repeat_feature = res[i].repeat(word2phone[i], 1)
|
38 |
+
phone_level_feature.append(repeat_feature)
|
39 |
+
|
40 |
+
phone_level_feature = torch.cat(phone_level_feature, dim=0)
|
41 |
+
|
42 |
+
return phone_level_feature.T
|