Mahiruoshi committed
Commit e6cd719
1 Parent(s): 45d2fcd

Upload 159 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitignore +17 -0
  2. .gitmodules +0 -0
  3. .pre-commit-config.yaml +4 -4
  4. Data/BangDreamV22/configs/config.json +197 -0
  5. Data/BangDreamV22/models/G_51000.pth +3 -0
  6. README.md +42 -13
  7. app.py +20 -43
  8. bert_gen.py +8 -6
  9. clap_gen.py +64 -0
  10. clap_wrapper.py +49 -0
  11. commons.py +6 -14
  12. compress_model.py +89 -0
  13. config.py +7 -2
  14. config.yml +25 -22
  15. configs/config.json +220 -217
  16. css/custom.css +18 -0
  17. data_utils.py +19 -8
  18. default_config.yml +15 -12
  19. emotional/clap-htsat-fused/.gitattributes +34 -0
  20. emotional/clap-htsat-fused/README.md +107 -0
  21. emotional/clap-htsat-fused/config.json +207 -0
  22. emotional/clap-htsat-fused/merges.txt +0 -0
  23. emotional/clap-htsat-fused/preprocessor_config.json +22 -0
  24. emotional/clap-htsat-fused/pytorch_model.bin +3 -0
  25. emotional/clap-htsat-fused/special_tokens_map.json +15 -0
  26. emotional/clap-htsat-fused/tokenizer.json +0 -0
  27. emotional/clap-htsat-fused/tokenizer_config.json +16 -0
  28. emotional/clap-htsat-fused/vocab.json +0 -0
  29. empty_emo.npy +3 -0
  30. export_onnx.py +4 -48
  31. img/yuyu.png +0 -0
  32. img/参数说明.png +0 -0
  33. img/宵宫.png +0 -0
  34. img/微信图片_20231010105112.png +0 -0
  35. img/神里绫华.png +0 -0
  36. img/纳西妲.png +0 -0
  37. infer.py +381 -0
  38. models.py +66 -35
  39. monotonic_align/__pycache__/__init__.cpython-311.pyc +0 -0
  40. monotonic_align/__pycache__/core.cpython-311.pyc +0 -0
  41. onnx_modules/V200/__init__.py +0 -0
  42. onnx_modules/V200/attentions_onnx.py +378 -0
  43. onnx_modules/V200/models_onnx.py +990 -0
  44. onnx_modules/V200/text/__init__.py +1 -0
  45. onnx_modules/V200/text/bert_utils.py +23 -0
  46. onnx_modules/V200/text/chinese.py +198 -0
  47. onnx_modules/V200/text/chinese_bert.py +101 -0
  48. onnx_modules/V200/text/cleaner.py +28 -0
  49. onnx_modules/V200/text/english.py +362 -0
  50. onnx_modules/V200/text/english_bert_mock.py +42 -0
.gitignore CHANGED
@@ -166,3 +166,20 @@ cython_debug/
166
  filelists/*
167
  !/filelists/esd.list
168
  data/*
166
  filelists/*
167
  !/filelists/esd.list
168
  data/*
169
+ /*.yml
170
+ !/default_config.yml
171
+ /Web/
172
+ /emotional/*/*.bin
173
+ /bert/*/*.bin
174
+ /bert/*/*.h5
175
+ /bert/*/*.model
176
+ /bert/*/*.safetensors
177
+ /bert/*/*.msgpack
178
+ asr_transcript.py
179
+ extract_list.py
180
+ dataset
181
+ /Data
182
+ Model
183
+ raw/
184
+ logs/
185
+ Data/*
.gitmodules ADDED
File without changes
.pre-commit-config.yaml CHANGED
@@ -1,24 +1,24 @@
1
  repos:
2
  - repo: https://github.com/pre-commit/pre-commit-hooks
3
- rev: v4.4.0
4
  hooks:
5
  - id: check-yaml
6
  - id: end-of-file-fixer
7
  - id: trailing-whitespace
8
 
9
  - repo: https://github.com/astral-sh/ruff-pre-commit
10
- rev: v0.0.291
11
  hooks:
12
  - id: ruff
13
  args: [ --fix ]
14
 
15
  - repo: https://github.com/psf/black
16
- rev: 23.9.1
17
  hooks:
18
  - id: black
19
 
20
  - repo: https://github.com/codespell-project/codespell
21
- rev: v2.2.5
22
  hooks:
23
  - id: codespell
24
  files: ^.*\.(py|md|rst|yml)$
 
1
  repos:
2
  - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.5.0
4
  hooks:
5
  - id: check-yaml
6
  - id: end-of-file-fixer
7
  - id: trailing-whitespace
8
 
9
  - repo: https://github.com/astral-sh/ruff-pre-commit
10
+ rev: v0.1.7
11
  hooks:
12
  - id: ruff
13
  args: [ --fix ]
14
 
15
  - repo: https://github.com/psf/black
16
+ rev: 23.11.0
17
  hooks:
18
  - id: black
19
 
20
  - repo: https://github.com/codespell-project/codespell
21
+ rev: v2.2.6
22
  hooks:
23
  - id: codespell
24
  files: ^.*\.(py|md|rst|yml)$
Data/BangDreamV22/configs/config.json ADDED
@@ -0,0 +1,197 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 3000,
5
+ "seed": 42,
6
+ "epochs": 1000,
7
+ "learning_rate": 0.0002,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 10,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.99995,
16
+ "segment_size": 16384,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "skip_optimizer": true,
22
+ "freeze_ZH_bert": false,
23
+ "freeze_JP_bert": false,
24
+ "freeze_EN_bert": false
25
+ },
26
+ "data": {
27
+ "training_files": "Data/BangDream/filelists/train.list",
28
+ "validation_files": "Data/BangDream/filelists/val.list",
29
+ "max_wav_value": 32768.0,
30
+ "sampling_rate": 44100,
31
+ "filter_length": 2048,
32
+ "hop_length": 512,
33
+ "win_length": 2048,
34
+ "n_mel_channels": 128,
35
+ "mel_fmin": 0.0,
36
+ "mel_fmax": null,
37
+ "add_blank": true,
38
+ "n_speakers": 99,
39
+ "cleaned_text": true,
40
+ "spk2id": {
41
+ "香澄": 0,
42
+ "有咲": 1,
43
+ "沙綾": 2,
44
+ "りみ": 3,
45
+ "たえ": 4,
46
+ "沙綾、りみ、たえ": 5,
47
+ "三月七1": 6,
48
+ "紗夜": 7,
49
+ "ロック": 8,
50
+ "パレオ": 9,
51
+ "レイヤ": 10,
52
+ "チュチュ": 11,
53
+ "彩": 12,
54
+ "千聖": 13,
55
+ "イヴ": 14,
56
+ "日菜": 15,
57
+ "麻弥": 16,
58
+ "蘭": 17,
59
+ "モカ": 18,
60
+ "巴": 19,
61
+ "ひまり": 20,
62
+ "つぐみ": 21,
63
+ "はぐみ": 22,
64
+ "花音": 23,
65
+ "美咲": 24,
66
+ "薫": 25,
67
+ "こころ": 26,
68
+ "つくし": 27,
69
+ "七深": 28,
70
+ "透子": 29,
71
+ "ましろ": 30,
72
+ "瑠唯": 31,
73
+ "友希那": 32,
74
+ "あこ": 33,
75
+ "リサ": 34,
76
+ "燐子": 35,
77
+ "燈": 36,
78
+ "愛音": 37,
79
+ "楽奈": 38,
80
+ "そよ": 39,
81
+ "立希": 40,
82
+ "ますき": 41,
83
+ "祥子": 42,
84
+ "睦": 43,
85
+ "海鈴": 44,
86
+ "にゃむ": 45,
87
+ "初華": 46,
88
+ "華戀": 47,
89
+ "晶": 48,
90
+ "光": 49,
91
+ "未知留": 50,
92
+ "香子": 51,
93
+ "雙葉": 52,
94
+ "真晝": 53,
95
+ "艾露": 54,
96
+ "珠緒": 55,
97
+ "艾露露": 56,
98
+ "純那": 57,
99
+ "克洛迪娜": 58,
100
+ "真矢": 59,
101
+ "奈奈": 60,
102
+ "壘": 61,
103
+ "文": 62,
104
+ "一愛": 63,
105
+ "菈樂菲": 64,
106
+ "司": 65,
107
+ "美空": 66,
108
+ "靜羽": 67,
109
+ "悠悠子": 68,
110
+ "八千代": 69,
111
+ "栞": 70,
112
+ "美帆": 71,
113
+ "芙蘿菈": 72,
114
+ "克蕾兒": 73,
115
+ "安德露": 74,
116
+ "瑪莉亞貝菈": 75,
117
+ "克拉迪亞": 76,
118
+ "桃樂西": 77,
119
+ "瑪麗安": 78,
120
+ "八重神子1": 79,
121
+ "娜塔莎": 80,
122
+ "宵宫": 81,
123
+ "派蒙11": 82,
124
+ "派蒙13": 83,
125
+ "派蒙3": 84,
126
+ "派蒙7": 85,
127
+ "派蒙8": 86,
128
+ "派蒙9": 87,
129
+ "派蒙10": 88,
130
+ "派蒙6": 89,
131
+ "派蒙4": 90,
132
+ "派蒙1": 91,
133
+ "派蒙2": 92,
134
+ "派蒙15": 93,
135
+ "派蒙16": 94,
136
+ "派蒙14": 95,
137
+ "派蒙12": 96,
138
+ "派蒙5": 97,
139
+ "纳西妲1": 98
140
+ }
141
+ },
142
+ "model": {
143
+ "use_spk_conditioned_encoder": true,
144
+ "use_noise_scaled_mas": true,
145
+ "use_mel_posterior_encoder": false,
146
+ "use_duration_discriminator": true,
147
+ "inter_channels": 192,
148
+ "hidden_channels": 192,
149
+ "filter_channels": 768,
150
+ "n_heads": 2,
151
+ "n_layers": 6,
152
+ "kernel_size": 3,
153
+ "p_dropout": 0.1,
154
+ "resblock": "1",
155
+ "resblock_kernel_sizes": [
156
+ 3,
157
+ 7,
158
+ 11
159
+ ],
160
+ "resblock_dilation_sizes": [
161
+ [
162
+ 1,
163
+ 3,
164
+ 5
165
+ ],
166
+ [
167
+ 1,
168
+ 3,
169
+ 5
170
+ ],
171
+ [
172
+ 1,
173
+ 3,
174
+ 5
175
+ ]
176
+ ],
177
+ "upsample_rates": [
178
+ 8,
179
+ 8,
180
+ 2,
181
+ 2,
182
+ 2
183
+ ],
184
+ "upsample_initial_channel": 512,
185
+ "upsample_kernel_sizes": [
186
+ 16,
187
+ 16,
188
+ 8,
189
+ 2,
190
+ 2
191
+ ],
192
+ "n_layers_q": 3,
193
+ "use_spectral_norm": false,
194
+ "gin_channels": 256
195
+ },
196
+ "version": "2.2"
197
+ }
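
The new dataset config is consumed through the repo's `utils.get_hparams_from_file` helper (the same call `app.py` makes further down in this commit). A minimal sketch, assuming the file path added here:

```python
import utils  # repo helper that wraps the JSON config into attribute access

# Load the dataset config added in this commit.
hps = utils.get_hparams_from_file("Data/BangDreamV22/configs/config.json")

print(hps.data.sampling_rate)  # 44100
print(hps.data.n_speakers)     # 99
print(hps.version)             # "2.2"
```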
Data/BangDreamV22/models/G_51000.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:521be4508c8b8b81e81201372cce0ac09cef35ca0f66b3d981f1689a601db3c5
3
+ size 750066550
README.md CHANGED
@@ -1,13 +1,42 @@
1
- ---
2
- title: Bushiroad BertVIts2 Emotional
3
- emoji: 📚
4
- colorFrom: purple
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 4.7.1
8
- app_file: app.py
9
- pinned: false
10
- license: openrail
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ <div align="center">
2
+
3
+ <img alt="LOGO" src="https://cdn.jsdelivr.net/gh/fishaudio/fish-diffusion@main/images/logo_512x512.png" width="256" height="256" />
4
+
5
+ # Bert-VITS2
6
+
7
+ VITS2 Backbone with multilingual bert
8
+
9
+ For quick guide, please refer to `webui_preprocess.py`.
10
+
11
+ 简易教程请参见 `webui_preprocess.py`。
12
+
13
+ ## 请注意,本项目核心思路来源于[anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS) 一个非常好的tts项目
14
+ ## MassTTS的演示demo为[ai版峰哥锐评峰哥本人,并找回了在金三角失落的腰子](https://www.bilibili.com/video/BV1w24y1c7z9)
15
+
16
+ [//]: # (## 本项目与[PlayVoice/vits_chinese]&#40;https://github.com/PlayVoice/vits_chinese&#41; 没有任何关系)
17
+
18
+ [//]: # ()
19
+ [//]: # (本仓库来源于之前朋友分享了ai峰哥的视频,本人被其中的效果惊艳,在自己尝试MassTTS以后发现fs在音质方面与vits有一定差距,并且training的pipeline比vits更复杂,因此按照其思路将bert)
20
+
21
+ ## 成熟的旅行者/开拓者/舰长/博士/sensei/猎魔人/喵喵露/V应当参阅代码自己学习如何训练。
22
+
23
+ ### 严禁将此项目用于一切违反《中华人民共和国宪法》,《中华人民共和国刑法》,《中华人民共和国治安管理处罚法》和《中华人民共和国民法典》之用途。
24
+ ### 严禁用于任何政治相关用途。
25
+ #### Video:https://www.bilibili.com/video/BV1hp4y1K78E
26
+ #### Demo:https://www.bilibili.com/video/BV1TF411k78w
27
+ #### QQ Group:815818430
28
+ ## References
29
+ + [anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS)
30
+ + [jaywalnut310/vits](https://github.com/jaywalnut310/vits)
31
+ + [p0p4k/vits2_pytorch](https://github.com/p0p4k/vits2_pytorch)
32
+ + [svc-develop-team/so-vits-svc](https://github.com/svc-develop-team/so-vits-svc)
33
+ + [PaddlePaddle/PaddleSpeech](https://github.com/PaddlePaddle/PaddleSpeech)
34
+ + [emotional-vits](https://github.com/innnky/emotional-vits)
35
+ + [Bert-VITS2-en](https://github.com/xwan07017/Bert-VITS2-en)
36
+ + [Bert-VITS2-UI](https://github.com/jiangyuxiaoxiao/Bert-VITS2-UI)
37
+ ## 感谢所有贡献者作出的努力
38
+ <a href="https://github.com/fishaudio/Bert-VITS2/graphs/contributors" target="_blank">
39
+ <img src="https://contrib.rocks/image?repo=fishaudio/Bert-VITS2"/>
40
+ </a>
41
+
42
+ [//]: # (# 本项目所有代码引用均已写明,bert部分代码思路来源于[AI峰哥]&#40;https://www.bilibili.com/video/BV1w24y1c7z9&#41;,与[vits_chinese]&#40;https://github.com/PlayVoice/vits_chinese&#41;无任何关系。欢迎各位查阅代码。同时,我们也对该开发者的[碰瓷,乃至开盒开发者的行为]&#40;https://www.bilibili.com/read/cv27101514/&#41;表示强烈谴责。)
app.py CHANGED
@@ -23,11 +23,8 @@ import torch.nn as nn
23
  from torch.utils.data import Dataset
24
  from torch.utils.data import DataLoader, Dataset
25
  from tqdm import tqdm
26
- from transformers import Wav2Vec2Processor
27
- from transformers.models.wav2vec2.modeling_wav2vec2 import (
28
- Wav2Vec2Model,
29
- Wav2Vec2PreTrainedModel,
30
- )
31
 
32
  import gradio as gr
33
 
@@ -37,7 +34,6 @@ from config import config
37
  import torch
38
  import commons
39
  from text import cleaned_text_to_sequence, get_bert
40
- from emo_gen import process_func, EmotionModel, Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2PreTrainedModel, RegressionHead
41
  from text.cleaner import clean_text
42
  import utils
43
 
@@ -46,7 +42,7 @@ from text.symbols import symbols
46
  import sys
47
 
48
  net_g = None
49
-
50
  device = (
51
  "cuda:0"
52
  if torch.cuda.is_available()
@@ -56,6 +52,7 @@ device = (
56
  else "cpu"
57
  )
58
  )
 
59
  device = "cpu"
60
  BandList = {
61
  "PoppinParty":["香澄","有咲","たえ","りみ","沙綾"],
@@ -73,7 +70,7 @@ BandList = {
73
  "西克菲尔特音乐学院":["晶","未知留","八千代","栞","美帆"]
74
  }
75
 
76
- def get_net_g(model_path: str, version: str, device: str, hps):
77
  net_g = SynthesizerTrn(
78
  len(symbols),
79
  hps.data.filter_length // 2 + 1,
@@ -125,27 +122,6 @@ def get_text(text, language_str, hps, device):
125
  language = torch.LongTensor(language)
126
  return bert, ja_bert, en_bert, phone, tone, language
127
 
128
- def get_emo_(reference_audio, emotion):
129
-
130
- if (emotion == 10 and reference_audio):
131
- emo = torch.from_numpy(get_emo(reference_audio))
132
- else:
133
- emo = torch.Tensor([emotion])
134
-
135
- return emo
136
-
137
- def get_emo(path):
138
- wav, sr = librosa.load(path, 16000)
139
- device = config.bert_gen_config.device
140
- return process_func(
141
- np.expand_dims(wav, 0).astype(np.float64),
142
- sr,
143
- emotional_model,
144
- emotional_processor,
145
- device,
146
- embeddings=True,
147
- ).squeeze(0)
148
-
149
  def infer(
150
  text,
151
  sdp_ratio,
@@ -154,16 +130,18 @@ def infer(
154
  length_scale,
155
  sid,
156
  reference_audio=None,
157
- emotion=None,
158
  ):
159
 
160
  language= 'JP' if is_japanese(text) else 'ZH'
161
- print(language)
 
 
 
 
162
  bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
163
  text, language, hps, device
164
  )
165
- emo = get_emo_(reference_audio, emotion)
166
- print(emo)
167
  with torch.no_grad():
168
  x_tst = phones.to(device).unsqueeze(0)
169
  tones = tones.to(device).unsqueeze(0)
@@ -212,18 +190,14 @@ def loadmodel(model):
212
  return "success"
213
 
214
  if __name__ == "__main__":
215
- emotional_model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim"
216
- REPO_ID = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
217
- emotional_processor = Wav2Vec2Processor.from_pretrained(emotional_model_name)
218
- emotional_model = EmotionModel.from_pretrained(emotional_model_name).to(device)
219
  languages = [ "Auto", "ZH", "JP"]
220
  modelPaths = []
221
- for dirpath, dirnames, filenames in os.walk('Data/Bushiroad/models/'):
222
  for filename in filenames:
223
  modelPaths.append(os.path.join(dirpath, filename))
224
- hps = utils.get_hparams_from_file('Data/Bushiroad/configs/config.json')
225
  net_g = get_net_g(
226
- model_path=modelPaths[-1], version="2.1", device=device, hps=hps
227
  )
228
  speaker_ids = hps.data.spk2id
229
  speakers = list(speaker_ids.keys())
@@ -247,9 +221,12 @@ if __name__ == "__main__":
247
  length_scale = gr.Slider(
248
  minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
249
  )
250
- emotion = gr.Slider(
251
- minimum=0, maximum=10, value=10, step=1, label="情感控制参数,调至10开启情感参考(建议开启,否则声线压不住),如不启动则设为0"
252
- )
 
 
 
253
  with gr.Accordion(label="参数设定", open=False):
254
  sdp_ratio = gr.Slider(
255
  minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
 
23
  from torch.utils.data import Dataset
24
  from torch.utils.data import DataLoader, Dataset
25
  from tqdm import tqdm
26
+ from clap_wrapper import get_clap_audio_feature, get_clap_text_feature
27
+
 
 
 
28
 
29
  import gradio as gr
30
 
 
34
  import torch
35
  import commons
36
  from text import cleaned_text_to_sequence, get_bert
 
37
  from text.cleaner import clean_text
38
  import utils
39
 
 
42
  import sys
43
 
44
  net_g = None
45
+ '''
46
  device = (
47
  "cuda:0"
48
  if torch.cuda.is_available()
 
52
  else "cpu"
53
  )
54
  )
55
+ '''
56
  device = "cpu"
57
  BandList = {
58
  "PoppinParty":["香澄","有咲","たえ","りみ","沙綾"],
 
70
  "西克菲尔特音乐学院":["晶","未知留","八千代","栞","美帆"]
71
  }
72
 
73
+ def get_net_g(model_path: str, device: str, hps):
74
  net_g = SynthesizerTrn(
75
  len(symbols),
76
  hps.data.filter_length // 2 + 1,
 
122
  language = torch.LongTensor(language)
123
  return bert, ja_bert, en_bert, phone, tone, language
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  def infer(
126
  text,
127
  sdp_ratio,
 
130
  length_scale,
131
  sid,
132
  reference_audio=None,
133
+ emotion='Happy',
134
  ):
135
 
136
  language= 'JP' if is_japanese(text) else 'ZH'
137
+ if isinstance(reference_audio, np.ndarray):
138
+ emo = get_clap_audio_feature(reference_audio, device)
139
+ else:
140
+ emo = get_clap_text_feature(emotion, device)
141
+ emo = torch.squeeze(emo, dim=1)
142
  bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
143
  text, language, hps, device
144
  )
 
 
145
  with torch.no_grad():
146
  x_tst = phones.to(device).unsqueeze(0)
147
  tones = tones.to(device).unsqueeze(0)
 
190
  return "success"
191
 
192
  if __name__ == "__main__":
 
 
 
 
193
  languages = [ "Auto", "ZH", "JP"]
194
  modelPaths = []
195
+ for dirpath, dirnames, filenames in os.walk('Data/BangDreamV22/models/'):
196
  for filename in filenames:
197
  modelPaths.append(os.path.join(dirpath, filename))
198
+ hps = utils.get_hparams_from_file('Data/BangDreamV22/configs/config.json')
199
  net_g = get_net_g(
200
+ model_path=modelPaths[-1], device=device, hps=hps
201
  )
202
  speaker_ids = hps.data.spk2id
203
  speakers = list(speaker_ids.keys())
 
221
  length_scale = gr.Slider(
222
  minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
223
  )
224
+ emotion = gr.Textbox(
225
+ label="Text prompt",
226
+ placeholder="用文字描述生成风格。如:Happy",
227
+ value="Happy",
228
+ visible=True,
229
+ )
230
  with gr.Accordion(label="参数设定", open=False):
231
  sdp_ratio = gr.Slider(
232
  minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
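
With the wav2vec2 emotion model removed, `infer` now derives its emotion embedding from CLAP: a NumPy reference audio goes through `get_clap_audio_feature`, otherwise the free-text prompt from the new Gradio textbox goes through `get_clap_text_feature`. A hedged sketch of that branch in isolation (names are taken from the diff; the inputs are placeholders):

```python
import numpy as np
import torch
from clap_wrapper import get_clap_audio_feature, get_clap_text_feature

device = "cpu"
reference_audio = None          # or a 48 kHz mono waveform as an np.ndarray
emotion = "Happy"               # free-text prompt from the new Gradio textbox

if isinstance(reference_audio, np.ndarray):
    emo = get_clap_audio_feature(reference_audio, device)
else:
    emo = get_clap_text_feature(emotion, device)
emo = torch.squeeze(emo, dim=1)  # (512,) conditioning vector handed to the synthesizer
```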
bert_gen.py CHANGED
@@ -1,12 +1,14 @@
 
 
 
1
  import torch
2
- from multiprocessing import Pool
 
 
3
  import commons
4
  import utils
5
- from tqdm import tqdm
6
- from text import cleaned_text_to_sequence, get_bert
7
- import argparse
8
- import torch.multiprocessing as mp
9
  from config import config
 
10
 
11
 
12
  def process_line(line):
@@ -64,7 +66,7 @@ if __name__ == "__main__":
64
  with open(hps.data.validation_files, encoding="utf-8") as f:
65
  lines.extend(f.readlines())
66
  if len(lines) != 0:
67
- num_processes = args.num_processes
68
  with Pool(processes=num_processes) as pool:
69
  for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
70
  pass
 
1
+ import argparse
2
+ from multiprocessing import Pool, cpu_count
3
+
4
  import torch
5
+ import torch.multiprocessing as mp
6
+ from tqdm import tqdm
7
+
8
  import commons
9
  import utils
 
 
 
 
10
  from config import config
11
+ from text import cleaned_text_to_sequence, get_bert
12
 
13
 
14
  def process_line(line):
 
66
  with open(hps.data.validation_files, encoding="utf-8") as f:
67
  lines.extend(f.readlines())
68
  if len(lines) != 0:
69
+ num_processes = min(args.num_processes, cpu_count())
70
  with Pool(processes=num_processes) as pool:
71
  for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
72
  pass
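
Besides the import reordering, the functional change here is the worker-count clamp, which keeps an over-generous `num_processes` in `config.yml` from oversubscribing the pool. Restated as a tiny hedged sketch (the requested value is illustrative):

```python
from multiprocessing import cpu_count

requested = 4                                # e.g. bert_gen.num_processes from config.yml
num_processes = min(requested, cpu_count())  # never spawn more workers than CPU cores
```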
clap_gen.py ADDED
@@ -0,0 +1,64 @@
1
+ import argparse
2
+ from multiprocessing import Pool, cpu_count
3
+
4
+ import torch
5
+ import torch.multiprocessing as mp
6
+ from tqdm import tqdm
7
+
8
+ import utils
9
+ from config import config
10
+ from clap_wrapper import get_clap_audio_feature
11
+ import librosa
12
+ import os
13
+
14
+ os.environ["OMP_NUM_THREADS"] = "1"
15
+ os.environ["MKL_NUM_THREADS"] = "1"
16
+
17
+
18
+ def process_line(line):
19
+ device = config.emo_gen_config.device
20
+ if config.emo_gen_config.use_multi_device:
21
+ rank = mp.current_process()._identity
22
+ rank = rank[0] if len(rank) > 0 else 0
23
+ if torch.cuda.is_available():
24
+ gpu_id = rank % torch.cuda.device_count()
25
+ device = torch.device(f"cuda:{gpu_id}")
26
+ else:
27
+ device = torch.device("cpu")
28
+ wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
29
+
30
+ clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.npy")
31
+ if os.path.isfile(clap_path):
32
+ return
33
+
34
+ audio = librosa.load(wav_path, 48000)[0]
35
+ # audio = librosa.resample(audio, 44100, 48000)
36
+
37
+ clap = get_clap_audio_feature(audio, device)
38
+ torch.save(clap, clap_path)
39
+
40
+
41
+ if __name__ == "__main__":
42
+ parser = argparse.ArgumentParser()
43
+ parser.add_argument(
44
+ "-c", "--config", type=str, default=config.emo_gen_config.config_path
45
+ )
46
+ parser.add_argument(
47
+ "--num_processes", type=int, default=config.emo_gen_config.num_processes
48
+ )
49
+ args, _ = parser.parse_known_args()
50
+ config_path = args.config
51
+ hps = utils.get_hparams_from_file(config_path)
52
+ lines = []
53
+ with open(hps.data.training_files, encoding="utf-8") as f:
54
+ lines.extend(f.readlines())
55
+
56
+ with open(hps.data.validation_files, encoding="utf-8") as f:
57
+ lines.extend(f.readlines())
58
+ if len(lines) != 0:
59
+ num_processes = min(args.num_processes, cpu_count())
60
+ with Pool(processes=num_processes) as pool:
61
+ for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
62
+ pass
63
+
64
+ print(f"clap生成完毕!, 共有{len(lines)}个emo.pt生成!")
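
`clap_gen.py` walks the train/val filelists and caches one CLAP audio embedding per utterance as `<name>.emo.npy` next to the wav. A hedged single-file sketch of what `process_line` does (the wav path is hypothetical; `sr` is given as a keyword because newer librosa releases reject the positional form used in the script):

```python
import librosa
import torch
from clap_wrapper import get_clap_audio_feature

wav_path = "Data/BangDreamV22/audios/wavs/example.wav"  # hypothetical path

# Load at 48 kHz, the rate the CLAP processor is called with in clap_wrapper.py.
audio, _ = librosa.load(wav_path, sr=48000)

emb = get_clap_audio_feature(audio, device="cpu")        # shape (512, 1)
torch.save(emb, wav_path.replace(".wav", ".emo.npy"))    # cache later read by data_utils.py
```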
clap_wrapper.py ADDED
@@ -0,0 +1,49 @@
1
+ import sys
2
+
3
+ import torch
4
+ from transformers import ClapModel, ClapProcessor
5
+
6
+ from config import config
7
+
8
+ models = dict()
9
+ processor = ClapProcessor.from_pretrained("./emotional/clap-htsat-fused")
10
+
11
+
12
+ def get_clap_audio_feature(audio_data, device=config.bert_gen_config.device):
13
+ if (
14
+ sys.platform == "darwin"
15
+ and torch.backends.mps.is_available()
16
+ and device == "cpu"
17
+ ):
18
+ device = "mps"
19
+ if not device:
20
+ device = "cuda"
21
+ if device not in models.keys():
22
+ models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to(
23
+ device
24
+ )
25
+ with torch.no_grad():
26
+ inputs = processor(
27
+ audios=audio_data, return_tensors="pt", sampling_rate=48000
28
+ ).to(device)
29
+ emb = models[device].get_audio_features(**inputs)
30
+ return emb.T
31
+
32
+
33
+ def get_clap_text_feature(text, device=config.bert_gen_config.device):
34
+ if (
35
+ sys.platform == "darwin"
36
+ and torch.backends.mps.is_available()
37
+ and device == "cpu"
38
+ ):
39
+ device = "mps"
40
+ if not device:
41
+ device = "cuda"
42
+ if device not in models.keys():
43
+ models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to(
44
+ device
45
+ )
46
+ with torch.no_grad():
47
+ inputs = processor(text=text, return_tensors="pt").to(device)
48
+ emb = models[device].get_text_features(**inputs)
49
+ return emb.T
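
A short usage sketch for the wrapper above. The 512-dimension figure matches the `emo` buffer allocated in `data_utils.py` later in this commit, and the transposed return value explains the `torch.squeeze(..., dim=1)` calls at the call sites:

```python
import torch
from clap_wrapper import get_clap_text_feature

# Text prompt -> CLAP embedding, returned transposed, so the shape is (512, 1).
emo = get_clap_text_feature("Happy", device="cpu")
print(emo.shape)                 # torch.Size([512, 1])

# Call sites in this commit squeeze away the trailing dimension before use.
emo = torch.squeeze(emo, dim=1)
print(emo.shape)                 # torch.Size([512])
```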
commons.py CHANGED
@@ -46,26 +46,18 @@ def rand_gumbel_like(x):
46
 
47
 
48
  def slice_segments(x, ids_str, segment_size=4):
49
- ret = torch.zeros_like(x[:, :, :segment_size])
50
- for i in range(x.size(0)):
51
- idx_str = ids_str[i]
52
- idx_end = idx_str + segment_size
53
- if idx_str < 0:
54
- i1 = x.size(2) + idx_str
55
- r1 = x[i, :, i1:]
56
- r2 = x[i, :, :idx_end]
57
- ret[i] = torch.cat([r1, r2], dim=1)
58
- else:
59
- ret[i] = x[i, :, idx_str:idx_end]
60
- return ret
61
 
62
 
63
  def rand_slice_segments(x, x_lengths=None, segment_size=4):
64
  b, d, t = x.size()
65
  if x_lengths is None:
66
  x_lengths = t
67
- ids_str_max = x_lengths - segment_size + 1
68
- ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
69
  ret = slice_segments(x, ids_str, segment_size)
70
  return ret, ids_str
71
 
 
46
 
47
 
48
  def slice_segments(x, ids_str, segment_size=4):
49
+ gather_indices = ids_str.view(x.size(0), 1, 1).repeat(
50
+ 1, x.size(1), 1
51
+ ) + torch.arange(segment_size, device=x.device)
52
+ return torch.gather(x, 2, gather_indices)
 
 
 
 
 
 
 
 
53
 
54
 
55
  def rand_slice_segments(x, x_lengths=None, segment_size=4):
56
  b, d, t = x.size()
57
  if x_lengths is None:
58
  x_lengths = t
59
+ ids_str_max = torch.clamp(x_lengths - segment_size + 1, min=0)
60
+ ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long)
61
  ret = slice_segments(x, ids_str, segment_size)
62
  return ret, ids_str
63
 
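
`slice_segments` is now a single `torch.gather` over an index tensor of shape `(B, D, segment_size)` instead of a Python loop. A quick hedged equivalence check against naive slicing, valid for the in-range, non-negative start indices that the clamped `rand_slice_segments` now produces:

```python
import torch

def slice_segments(x, ids_str, segment_size=4):
    # Vectorised version from this commit: one gather instead of a Python loop.
    gather_indices = ids_str.view(x.size(0), 1, 1).repeat(
        1, x.size(1), 1
    ) + torch.arange(segment_size, device=x.device)
    return torch.gather(x, 2, gather_indices)

x = torch.randn(2, 3, 10)           # (batch, channels, time)
ids = torch.tensor([1, 5])          # per-sample start indices
out = slice_segments(x, ids, segment_size=4)

# Naive reference: slice each sample in a Python loop.
ref = torch.stack([x[i, :, s : s + 4] for i, s in enumerate(ids)])
assert torch.equal(out, ref)        # identical for in-range, non-negative starts
```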
compress_model.py ADDED
@@ -0,0 +1,89 @@
1
+ from collections import OrderedDict
2
+ from text.symbols import symbols
3
+ import torch
4
+
5
+ from tools.log import logger
6
+ import utils
7
+ from models import SynthesizerTrn
8
+ import os
9
+
10
+
11
+ def copyStateDict(state_dict):
12
+ if list(state_dict.keys())[0].startswith("module"):
13
+ start_idx = 1
14
+ else:
15
+ start_idx = 0
16
+ new_state_dict = OrderedDict()
17
+ for k, v in state_dict.items():
18
+ name = ",".join(k.split(".")[start_idx:])
19
+ new_state_dict[name] = v
20
+ return new_state_dict
21
+
22
+
23
+ def removeOptimizer(config: str, input_model: str, ishalf: bool, output_model: str):
24
+ hps = utils.get_hparams_from_file(config)
25
+
26
+ net_g = SynthesizerTrn(
27
+ len(symbols),
28
+ hps.data.filter_length // 2 + 1,
29
+ hps.train.segment_size // hps.data.hop_length,
30
+ n_speakers=hps.data.n_speakers,
31
+ **hps.model,
32
+ )
33
+
34
+ optim_g = torch.optim.AdamW(
35
+ net_g.parameters(),
36
+ hps.train.learning_rate,
37
+ betas=hps.train.betas,
38
+ eps=hps.train.eps,
39
+ )
40
+
41
+ state_dict_g = torch.load(input_model, map_location="cpu")
42
+ new_dict_g = copyStateDict(state_dict_g)
43
+ keys = []
44
+ for k, v in new_dict_g["model"].items():
45
+ if "enc_q" in k:
46
+ continue # noqa: E701
47
+ keys.append(k)
48
+
49
+ new_dict_g = (
50
+ {k: new_dict_g["model"][k].half() for k in keys}
51
+ if ishalf
52
+ else {k: new_dict_g["model"][k] for k in keys}
53
+ )
54
+
55
+ torch.save(
56
+ {
57
+ "model": new_dict_g,
58
+ "iteration": 0,
59
+ "optimizer": optim_g.state_dict(),
60
+ "learning_rate": 0.0001,
61
+ },
62
+ output_model,
63
+ )
64
+
65
+
66
+ if __name__ == "__main__":
67
+ import argparse
68
+
69
+ parser = argparse.ArgumentParser()
70
+ parser.add_argument("-c", "--config", type=str, default="configs/config.json")
71
+ parser.add_argument("-i", "--input", type=str)
72
+ parser.add_argument("-o", "--output", type=str, default=None)
73
+ parser.add_argument(
74
+ "-hf", "--half", action="store_true", default=False, help="Save as FP16"
75
+ )
76
+
77
+ args = parser.parse_args()
78
+
79
+ output = args.output
80
+
81
+ if output is None:
82
+ import os.path
83
+
84
+ filename, ext = os.path.splitext(args.input)
85
+ half = "_half" if args.half else ""
86
+ output = filename + "_release" + half + ext
87
+
88
+ removeOptimizer(args.config, args.input, args.half, output)
89
+ logger.info(f"压缩模型成功, 输出模型: {os.path.abspath(output)}")
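
A hedged usage sketch for the new compression script; the output path is illustrative, and the same thing can be done from the CLI with the `-c`, `-i`, `-o` and `-hf` flags defined above:

```python
from compress_model import removeOptimizer

# Strip optimizer state (and optionally cast to FP16) to get a small release checkpoint.
removeOptimizer(
    config="Data/BangDreamV22/configs/config.json",
    input_model="Data/BangDreamV22/models/G_51000.pth",
    ishalf=False,
    output_model="Data/BangDreamV22/models/G_51000_release.pth",  # illustrative output path
)
```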
config.py CHANGED
@@ -38,7 +38,7 @@ class Preprocess_text_config:
38
  train_path: str,
39
  val_path: str,
40
  config_path: str,
41
- val_per_spk: int = 5,
42
  max_val_total: int = 10000,
43
  clean: bool = True,
44
  ):
@@ -47,7 +47,7 @@ class Preprocess_text_config:
47
  self.train_path: str = train_path # 训练集路径,可以不填。不填则将在原始文本目录生成
48
  self.val_path: str = val_path # 验证集路径,可以不填。不填则将在原始文本目录生成
49
  self.config_path: str = config_path # 配置文件路径
50
- self.val_per_spk: int = val_per_spk # 每个speaker的验证集条数
51
  self.max_val_total: int = max_val_total # 验证集最大条数,多于的会被截断并放到训练集中
52
  self.clean: bool = clean # 是否进行数据清洗
53
 
@@ -99,10 +99,12 @@ class Emo_gen_config:
99
  config_path: str,
100
  num_processes: int = 2,
101
  device: str = "cuda",
 
102
  ):
103
  self.config_path = config_path
104
  self.num_processes = num_processes
105
  self.device = device
 
106
 
107
  @classmethod
108
  def from_dict(cls, dataset_path: str, data: Dict[str, any]):
@@ -222,6 +224,9 @@ class Config:
222
  self.bert_gen_config: Bert_gen_config = Bert_gen_config.from_dict(
223
  dataset_path, yaml_config["bert_gen"]
224
  )
 
 
 
225
  self.train_ms_config: Train_ms_config = Train_ms_config.from_dict(
226
  dataset_path, yaml_config["train_ms"]
227
  )
 
38
  train_path: str,
39
  val_path: str,
40
  config_path: str,
41
+ val_per_lang: int = 5,
42
  max_val_total: int = 10000,
43
  clean: bool = True,
44
  ):
 
47
  self.train_path: str = train_path # 训练集路径,可以不填。不填则将在原始文本目录生成
48
  self.val_path: str = val_path # 验证集路径,可以不填。不填则将在原始文本目录生成
49
  self.config_path: str = config_path # 配置文件路径
50
+ self.val_per_lang: int = val_per_lang # 每个speaker的验证集条数
51
  self.max_val_total: int = max_val_total # 验证集最大条数,多于的会被截断并放到训练集中
52
  self.clean: bool = clean # 是否进行数据清洗
53
 
 
99
  config_path: str,
100
  num_processes: int = 2,
101
  device: str = "cuda",
102
+ use_multi_device: bool = False,
103
  ):
104
  self.config_path = config_path
105
  self.num_processes = num_processes
106
  self.device = device
107
+ self.use_multi_device = use_multi_device
108
 
109
  @classmethod
110
  def from_dict(cls, dataset_path: str, data: Dict[str, any]):
 
224
  self.bert_gen_config: Bert_gen_config = Bert_gen_config.from_dict(
225
  dataset_path, yaml_config["bert_gen"]
226
  )
227
+ self.emo_gen_config: Emo_gen_config = Emo_gen_config.from_dict(
228
+ dataset_path, yaml_config["emo_gen"]
229
+ )
230
  self.train_ms_config: Train_ms_config = Train_ms_config.from_dict(
231
  dataset_path, yaml_config["train_ms"]
232
  )
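
With `Emo_gen_config` now wired into `Config`, the CLAP scripts read their settings from the `emo_gen` block of `config.yml` through the module-level `config` singleton. A small sketch of the fields `clap_gen.py` consumes, assuming the `emo_gen` block shown in this commit's `config.yml`:

```python
from config import config  # module-level Config instance built from config.yml

emo_cfg = config.emo_gen_config
print(emo_cfg.config_path)       # dataset config used by clap_gen.py
print(emo_cfg.num_processes)     # parallel workers for CLAP feature extraction
print(emo_cfg.device)            # "cuda" or "cpu"
print(emo_cfg.use_multi_device)  # new flag: spread workers across multiple GPUs
```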
config.yml CHANGED
@@ -4,10 +4,10 @@
4
  # 拟提供通用路径配置,统一存放数据,避免数据放得很乱
5
  # 每个数据集与其对应的模型存放至统一路径下,后续所有的路径配置均为相对于datasetPath的路径
6
  # 不填或者填空则路径为相对于项目根目录的路径
7
- dataset_path: "Data/BanGDream"
8
 
9
  # 模型镜像源,默认huggingface,使用openi镜像源需指定openi_token
10
- mirror: "openai"
11
  openi_token: "" # openi token
12
 
13
  # resample 音频重采样配置
@@ -17,16 +17,16 @@ resample:
17
  sampling_rate: 44100
18
  # 音频文件输入路径,重采样会将该路径下所有.wav音频文件重采样
19
  # 请填入相对于datasetPath的相对路径
20
- in_dir: "" # 相对于根目录的路径为 /datasetPath/in_dir
21
  # 音频文件重采样后输出路径
22
- out_dir: ""
23
 
24
 
25
  # preprocess_text 数据集预处理相关配置
26
  # 注意, “:” 后需要加空格
27
  preprocess_text:
28
  # 原始文本文件路径,文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
29
- transcription_path: "filelists/Mygo.list"
30
  # 数据清洗后文本路径,可以不填。不填则将在原始文本目录生成
31
  cleaned_path: ""
32
  # 训练集路径
@@ -34,11 +34,11 @@ preprocess_text:
34
  # 验证集路径
35
  val_path: "filelists/val.list"
36
  # 配置文件路径
37
- config_path: "configs/config.json"
38
- # 每个speaker的验证集条数
39
- val_per_spk: 4
40
  # 验证集最大条数,多于的会被截断并放到训练集中
41
- max_val_total: 8
42
  # 是否进行数据清洗
43
  clean: true
44
 
@@ -47,9 +47,9 @@ preprocess_text:
47
  # 注意, “:” 后需要加空格
48
  bert_gen:
49
  # 训练数据集配置文件路径
50
- config_path: "configs/config.json"
51
  # 并行数
52
- num_processes: 2
53
  # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
54
  # 该选项同时决定了get_bert_feature的默认设备
55
  device: "cuda"
@@ -60,11 +60,13 @@ bert_gen:
60
  # 注意, “:” 后需要加空格
61
  emo_gen:
62
  # 训练数据集配置文件路径
63
- config_path: "configs/config.json"
64
  # 并行数
65
- num_processes: 2
66
  # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
67
  device: "cuda"
 
 
68
 
69
  # train 训练配置
70
  # 注意, “:” 后需要加空格
@@ -79,13 +81,13 @@ train_ms:
79
  # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
80
  # 底模设置
81
  base:
82
- use_base_model: True
83
  repo_id: "Stardust_minus/Bert-VITS2"
84
- model_image: "Bert-VITS2_2.1-Emo底模" # openi网页的模型名
85
  # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
86
  model: "models"
87
  # 配置文件路径
88
- config_path: "configs/config.json"
89
  # 训练使用的worker,不建议超过CPU核心数
90
  num_workers: 16
91
  # 关闭此项可以节约接近50%的磁盘空间,但是可能导致实际训练速度变慢和更高的CPU使用率。
@@ -98,11 +100,11 @@ train_ms:
98
  # 注意, “:” 后需要加空格
99
  webui:
100
  # 推理设备
101
- device: "cpu"
102
  # 模型路径
103
- model: "models/G_30000.pth"
104
  # 配置文件路径
105
- config_path: "configs/config.json"
106
  # 端口号
107
  port: 7860
108
  # 是否公开部署,对外网开放
@@ -113,7 +115,7 @@ webui:
113
  language_identification_library: "langid"
114
 
115
 
116
- # server api配置
117
  # 注意, “:” 后需要加空格
118
  # 注意,本配置下的所有配置均为相对于根目录的路径
119
  server:
@@ -121,8 +123,10 @@ server:
121
  port: 5000
122
  # 模型默认使用设备:但是当前并没有实现这个配置。
123
  device: "cuda"
124
- # 需要加载的所有模型的配置
 
125
  # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。
 
126
  models:
127
  - # 模型的路径
128
  model: ""
@@ -163,7 +167,6 @@ server:
163
  # 不必填写所有人物,不填的使用默认值
164
  speakers: [ ] # 也可以不填
165
 
166
-
167
  # 百度翻译开放平台 api配置
168
  # api接入文档 https://api.fanyi.baidu.com/doc/21
169
  # 请不要在github等网站公开分享你的app id 与 key
 
4
  # 拟提供通用路径配置,统一存放数据,避免数据放得很乱
5
  # 每个数据集与其对应的模型存放至统一路径下,后续所有的路径配置均为相对于datasetPath的路径
6
  # 不填或者填空则路径为相对于项目根目录的路径
7
+ dataset_path: "Data/"
8
 
9
  # 模型镜像源,默认huggingface,使用openi镜像源需指定openi_token
10
+ mirror: ""
11
  openi_token: "" # openi token
12
 
13
  # resample 音频重采样配置
 
17
  sampling_rate: 44100
18
  # 音频文件输入路径,重采样会将该路径下所有.wav音频文件重采样
19
  # 请填入相对于datasetPath的相对路径
20
+ in_dir: "audios/raw" # 相对于根目录的路径为 /datasetPath/in_dir
21
  # 音频文件重采样后输出路径
22
+ out_dir: "audios/wavs"
23
 
24
 
25
  # preprocess_text 数据集预处理相关配置
26
  # 注意, “:” 后需要加空格
27
  preprocess_text:
28
  # 原始文本文件路径,文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
29
+ transcription_path: "filelists/你的数据集文本.list"
30
  # 数据清洗后文本路径,可以不填。不填则将在原始文本目录生成
31
  cleaned_path: ""
32
  # 训练集路径
 
34
  # 验证集路径
35
  val_path: "filelists/val.list"
36
  # 配置文件路径
37
+ config_path: "config.json"
38
+ # 每个语言的验证集条数
39
+ val_per_lang: 4
40
  # 验证集最大条数,多于的会被截断并放到训练集中
41
+ max_val_total: 12
42
  # 是否进行数据清洗
43
  clean: true
44
 
 
47
  # 注意, “:” 后需要加空格
48
  bert_gen:
49
  # 训练数据集配置文件路径
50
+ config_path: "config.json"
51
  # 并行数
52
+ num_processes: 4
53
  # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
54
  # 该选项同时决定了get_bert_feature的默认设备
55
  device: "cuda"
 
60
  # 注意, “:” 后需要加空格
61
  emo_gen:
62
  # 训练数据集配置文件路径
63
+ config_path: "config.json"
64
  # 并行数
65
+ num_processes: 4
66
  # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
67
  device: "cuda"
68
+ # 使用多卡推理
69
+ use_multi_device: false
70
 
71
  # train 训练配置
72
  # 注意, “:” 后需要加空格
 
81
  # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
82
  # 底模设置
83
  base:
84
+ use_base_model: false
85
  repo_id: "Stardust_minus/Bert-VITS2"
86
+ model_image: "Bert-VITS2_2.2-Clap底模" # openi网页的模型名
87
  # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
88
  model: "models"
89
  # 配置文件路径
90
+ config_path: "config.json"
91
  # 训练使用的worker,不建议超过CPU核心数
92
  num_workers: 16
93
  # 关闭此项可以节约接近50%的磁盘空间,但是可能导致实际训练速度变慢和更高的CPU使用率。
 
100
  # 注意, “:” 后需要加空格
101
  webui:
102
  # 推理设备
103
+ device: "cuda"
104
  # 模型路径
105
+ model: "models/G_8000.pth"
106
  # 配置文件路径
107
+ config_path: "config.json"
108
  # 端口号
109
  port: 7860
110
  # 是否公开部署,对外网开放
 
115
  language_identification_library: "langid"
116
 
117
 
118
+ # server-fastapi配置
119
  # 注意, “:” 后需要加空格
120
  # 注意,本配置下的所有配置均为相对于根目录的路径
121
  server:
 
123
  port: 5000
124
  # 模型默认使用设备:但是当前并没有实现这个配置。
125
  device: "cuda"
126
+ # 需要加载的所有模型的配置,可以填多个模型,也可以不填模型,等网页成功后手动加载模型
127
+ # 不加载模型的配置格式:删除默认给的两个模型配置,给models赋值 [ ],也就是空列表。参考模型2的speakers 即 models: [ ]
128
  # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。
129
+ # 也可以不填模型,等网页加载成功后手动填写models。
130
  models:
131
  - # 模型的路径
132
  model: ""
 
167
  # 不必填写所有人物,不填的使用默认值
168
  speakers: [ ] # 也可以不填
169
 
 
170
  # 百度翻译开放平台 api配置
171
  # api接入文档 https://api.fanyi.baidu.com/doc/21
172
  # 请不要在github等网站公开分享你的app id 与 key
configs/config.json CHANGED
@@ -10,7 +10,7 @@
10
  0.99
11
  ],
12
  "eps": 1e-09,
13
- "batch_size": 24,
14
  "fp16_run": false,
15
  "lr_decay": 0.99995,
16
  "segment_size": 16384,
@@ -18,7 +18,10 @@
18
  "warmup_epochs": 0,
19
  "c_mel": 45,
20
  "c_kl": 1.0,
21
- "skip_optimizer": true
 
 
 
22
  },
23
  "data": {
24
  "training_files": "filelists/train.list",
@@ -676,220 +679,220 @@
676
  "埃舍尔_EN": 638,
677
  "萨齐因_EN": 639,
678
  "古田_EN": 640,
679
- "陆景和": 641,
680
- "莫弈": 642,
681
- "左然": 643,
682
- "夏彦": 644,
683
- "三月七_ZH": 645,
684
- "丹恒_ZH": 646,
685
- "希儿_ZH": 647,
686
- "娜塔莎_ZH": 648,
687
- "希露瓦_ZH": 649,
688
- "瓦尔特_ZH": 650,
689
- "佩拉_ZH": 651,
690
- "布洛妮娅_ZH": 652,
691
- "虎克_ZH": 653,
692
- "素裳_ZH": 654,
693
- "克拉拉_ZH": 655,
694
- "符玄_ZH": 656,
695
- "白露_ZH": 657,
696
- "杰帕德_ZH": 658,
697
- "景元_ZH": 659,
698
- "藿藿_ZH": 660,
699
- "姬子_ZH": 661,
700
- "_ZH": 662,
701
- "_ZH": 663,
702
- "卡芙卡_ZH": 664,
703
- "桂乃芬_ZH": 665,
704
- "艾丝妲_ZH": 666,
705
- "玲可_ZH": 667,
706
- "彦卿_ZH": 668,
707
- "托帕_ZH": 669,
708
- "驭空_ZH": 670,
709
- "浮烟_ZH": 671,
710
- "停云_ZH": 672,
711
- "镜流_ZH": 673,
712
- "罗刹_ZH": 674,
713
- "卢卡_ZH": 675,
714
- "史瓦罗_ZH": 676,
715
- "黑塔_ZH": 677,
716
- "桑博_ZH": 678,
717
- "伦纳德_ZH": 679,
718
- "明曦_ZH": 680,
719
- "银狼_ZH": 681,
720
- "帕姆_ZH": 682,
721
- "青雀_ZH": 683,
722
- "乔瓦尼_ZH": 684,
723
- "公输师傅_ZH": 685,
724
- "晴霓_ZH": 686,
725
- "螺丝咕姆_ZH": 687,
726
- "阿兰_ZH": 688,
727
- "奥列格_ZH": 689,
728
- "丹枢_ZH": 690,
729
- "尾巴_ZH": 691,
730
- "寒鸦_ZH": 692,
731
- "雪衣_ZH": 693,
732
- "可可利亚_ZH": 694,
733
- "青镞_ZH": 695,
734
- "半夏_ZH": 696,
735
- "银枝_ZH": 697,
736
- "大毫_ZH": 698,
737
- "霄翰_ZH": 699,
738
- "信使_ZH": 700,
739
- "费斯曼_ZH": 701,
740
- "绿芙蓉_ZH": 702,
741
- "dev_成男_ZH": 703,
742
- "金人会长_ZH": 704,
743
- "维利特_ZH": 705,
744
- "维尔德_ZH": 706,
745
- "斯科特_ZH": 707,
746
- "卡波特_ZH": 708,
747
- "刃_ZH": 709,
748
- "岩明_ZH": 710,
749
- "浣溪_ZH": 711,
750
- "三月七_JP": 712,
751
- "丹恒_JP": 713,
752
- "希儿_JP": 714,
753
- "娜塔莎_JP": 715,
754
- "希露瓦_JP": 716,
755
- "瓦尔特_JP": 717,
756
- "佩拉_JP": 718,
757
- "布洛妮娅_JP": 719,
758
- "虎克_JP": 720,
759
- "素裳_JP": 721,
760
- "克拉拉_JP": 722,
761
- "符玄_JP": 723,
762
- "白露_JP": 724,
763
- "杰帕德_JP": 725,
764
- "景元_JP": 726,
765
- "藿藿_JP": 727,
766
- "姬子_JP": 728,
767
- "卡芙卡_JP": 729,
768
- "_JP": 730,
769
- "_JP": 731,
770
- "桂乃芬_JP": 732,
771
- "艾丝妲_JP": 733,
772
- "彦卿_JP": 734,
773
- "玲可_JP": 735,
774
- "托帕_JP": 736,
775
- "驭空_JP": 737,
776
- "浮烟_JP": 738,
777
- "停云_JP": 739,
778
- "镜流_JP": 740,
779
- "罗刹_JP": 741,
780
- "卢卡_JP": 742,
781
- "史瓦罗_JP": 743,
782
- "黑塔_JP": 744,
783
- "桑博_JP": 745,
784
- "伦纳德_JP": 746,
785
- "明曦_JP": 747,
786
- "银狼_JP": 748,
787
- "帕姆_JP": 749,
788
- "青雀_JP": 750,
789
- "乔瓦尼_JP": 751,
790
- "公输师傅_JP": 752,
791
- "晴霓_JP": 753,
792
- "螺丝咕姆_JP": 754,
793
- "阿兰_JP": 755,
794
- "奥列格_JP": 756,
795
- "丹枢_JP": 757,
796
- "尾巴_JP": 758,
797
- "寒鸦_JP": 759,
798
- "雪衣_JP": 760,
799
- "可可利亚_JP": 761,
800
- "青镞_JP": 762,
801
- "半夏_JP": 763,
802
- "银枝_JP": 764,
803
- "大毫_JP": 765,
804
- "霄翰_JP": 766,
805
- "信使_JP": 767,
806
- "费斯曼_JP": 768,
807
- "绿芙蓉_JP": 769,
808
- "dev_成男_JP": 770,
809
- "金人会长_JP": 771,
810
- "维利特_JP": 772,
811
- "维尔德_JP": 773,
812
- "斯科特_JP": 774,
813
- "_JP": 775,
814
- "卡波特_JP": 776,
815
- "岩明_JP": 777,
816
- "浣溪_JP": 778,
817
- "净砚_JP": 779,
818
- "紫月季_JP": 780,
819
- "歌蒂_JP": 781,
820
- "奇怪的云骑_JP": 782,
821
- "幻胧_JP": 783,
822
- "斯薇塔_JP": 784,
823
- "隐书_JP": 785,
824
- "三月七_EN": 786,
825
- "丹恒_EN": 787,
826
- "希儿_EN": 788,
827
- "娜塔莎_EN": 789,
828
- "希露瓦_EN": 790,
829
- "瓦尔特_EN": 791,
830
- "佩拉_EN": 792,
831
- "布洛妮娅_EN": 793,
832
- "虎克_EN": 794,
833
- "素裳_EN": 795,
834
- "克拉拉_EN": 796,
835
- "符玄_EN": 797,
836
- "白露_EN": 798,
837
- "杰帕德_EN": 799,
838
- "景元_EN": 800,
839
- "藿藿_EN": 801,
840
- "姬子_EN": 802,
841
- "卡芙卡_EN": 803,
842
- "_EN": 804,
843
- "_EN": 805,
844
- "桂乃芬_EN": 806,
845
- "艾丝妲_EN": 807,
846
- "彦卿_EN": 808,
847
- "玲可_EN": 809,
848
- "托帕_EN": 810,
849
- "驭空_EN": 811,
850
- "浮烟_EN": 812,
851
- "停云_EN": 813,
852
- "镜流_EN": 814,
853
- "罗刹_EN": 815,
854
- "卢卡_EN": 816,
855
- "史瓦罗_EN": 817,
856
- "黑塔_EN": 818,
857
- "桑博_EN": 819,
858
- "伦纳德_EN": 820,
859
- "明曦_EN": 821,
860
- "银狼_EN": 822,
861
- "帕姆_EN": 823,
862
- "青雀_EN": 824,
863
- "乔瓦尼_EN": 825,
864
- "公输师傅_EN": 826,
865
- "晴霓_EN": 827,
866
- "螺丝咕姆_EN": 828,
867
- "阿兰_EN": 829,
868
- "奥列格_EN": 830,
869
- "丹枢_EN": 831,
870
- "尾巴_EN": 832,
871
- "寒鸦_EN": 833,
872
- "雪衣_EN": 834,
873
- "可可利亚_EN": 835,
874
- "青镞_EN": 836,
875
- "半夏_EN": 837,
876
- "银枝_EN": 838,
877
- "大毫_EN": 839,
878
- "霄翰_EN": 840,
879
- "信使_EN": 841,
880
- "费斯曼_EN": 842,
881
- "绿芙蓉_EN": 843,
882
- "dev_成男_EN": 844,
883
- "金人会长_EN": 845,
884
- "维利特_EN": 846,
885
- "维尔德_EN": 847,
886
- "_EN": 848,
887
- "卡波特_EN": 849,
888
- "岩明_EN": 850,
889
- "浣溪_EN": 851,
890
- "紫月季_EN": 852,
891
- "幻胧_EN": 853,
892
- "女声_EN": 854
893
  }
894
  },
895
  "model": {
@@ -946,5 +949,5 @@
946
  "use_spectral_norm": false,
947
  "gin_channels": 256
948
  },
949
- "version": "2.1"
950
  }
 
10
  0.99
11
  ],
12
  "eps": 1e-09,
13
+ "batch_size": 12,
14
  "fp16_run": false,
15
  "lr_decay": 0.99995,
16
  "segment_size": 16384,
 
18
  "warmup_epochs": 0,
19
  "c_mel": 45,
20
  "c_kl": 1.0,
21
+ "skip_optimizer": true,
22
+ "freeze_ZH_bert": false,
23
+ "freeze_JP_bert": false,
24
+ "freeze_EN_bert": false
25
  },
26
  "data": {
27
  "training_files": "filelists/train.list",
 
679
  "埃舍尔_EN": 638,
680
  "萨齐因_EN": 639,
681
  "古田_EN": 640,
682
+ "三月七_ZH": 641,
683
+ "丹恒_ZH": 642,
684
+ "希儿_ZH": 643,
685
+ "娜塔莎_ZH": 644,
686
+ "希露瓦_ZH": 645,
687
+ "瓦尔特_ZH": 646,
688
+ "佩拉_ZH": 647,
689
+ "布洛妮娅_ZH": 648,
690
+ "虎克_ZH": 649,
691
+ "素裳_ZH": 650,
692
+ "克拉拉_ZH": 651,
693
+ "符玄_ZH": 652,
694
+ "白露_ZH": 653,
695
+ "杰帕德_ZH": 654,
696
+ "景元_ZH": 655,
697
+ "藿藿_ZH": 656,
698
+ "姬子_ZH": 657,
699
+ "_ZH": 658,
700
+ "_ZH": 659,
701
+ "卡芙卡_ZH": 660,
702
+ "桂乃芬_ZH": 661,
703
+ "艾丝妲_ZH": 662,
704
+ "玲可_ZH": 663,
705
+ "彦卿_ZH": 664,
706
+ "托帕_ZH": 665,
707
+ "驭空_ZH": 666,
708
+ "浮烟_ZH": 667,
709
+ "停云_ZH": 668,
710
+ "镜流_ZH": 669,
711
+ "罗刹_ZH": 670,
712
+ "卢卡_ZH": 671,
713
+ "史瓦罗_ZH": 672,
714
+ "黑塔_ZH": 673,
715
+ "桑博_ZH": 674,
716
+ "伦纳德_ZH": 675,
717
+ "明曦_ZH": 676,
718
+ "银狼_ZH": 677,
719
+ "帕姆_ZH": 678,
720
+ "青雀_ZH": 679,
721
+ "乔瓦尼_ZH": 680,
722
+ "公输师傅_ZH": 681,
723
+ "晴霓_ZH": 682,
724
+ "螺丝咕姆_ZH": 683,
725
+ "阿兰_ZH": 684,
726
+ "奥列格_ZH": 685,
727
+ "丹枢_ZH": 686,
728
+ "尾巴_ZH": 687,
729
+ "寒鸦_ZH": 688,
730
+ "雪衣_ZH": 689,
731
+ "可可利亚_ZH": 690,
732
+ "青镞_ZH": 691,
733
+ "半夏_ZH": 692,
734
+ "银枝_ZH": 693,
735
+ "大毫_ZH": 694,
736
+ "霄翰_ZH": 695,
737
+ "信使_ZH": 696,
738
+ "费斯曼_ZH": 697,
739
+ "绿芙蓉_ZH": 698,
740
+ "dev_成男_ZH": 699,
741
+ "金人会长_ZH": 700,
742
+ "维利特_ZH": 701,
743
+ "维尔德_ZH": 702,
744
+ "斯科特_ZH": 703,
745
+ "卡波特_ZH": 704,
746
+ "_ZH": 705,
747
+ "岩明_ZH": 706,
748
+ "浣溪_ZH": 707,
749
+ "三月七_JP": 708,
750
+ "丹恒_JP": 709,
751
+ "希儿_JP": 710,
752
+ "娜塔莎_JP": 711,
753
+ "希露瓦_JP": 712,
754
+ "瓦尔特_JP": 713,
755
+ "佩拉_JP": 714,
756
+ "布洛妮娅_JP": 715,
757
+ "虎克_JP": 716,
758
+ "素裳_JP": 717,
759
+ "克拉拉_JP": 718,
760
+ "符玄_JP": 719,
761
+ "白露_JP": 720,
762
+ "杰帕德_JP": 721,
763
+ "景元_JP": 722,
764
+ "藿藿_JP": 723,
765
+ "姬子_JP": 724,
766
+ "卡芙卡_JP": 725,
767
+ "_JP": 726,
768
+ "_JP": 727,
769
+ "桂乃芬_JP": 728,
770
+ "艾丝妲_JP": 729,
771
+ "彦卿_JP": 730,
772
+ "玲可_JP": 731,
773
+ "托帕_JP": 732,
774
+ "驭空_JP": 733,
775
+ "浮烟_JP": 734,
776
+ "停云_JP": 735,
777
+ "镜流_JP": 736,
778
+ "罗刹_JP": 737,
779
+ "卢卡_JP": 738,
780
+ "史瓦罗_JP": 739,
781
+ "黑塔_JP": 740,
782
+ "桑博_JP": 741,
783
+ "伦纳德_JP": 742,
784
+ "明曦_JP": 743,
785
+ "银狼_JP": 744,
786
+ "帕姆_JP": 745,
787
+ "青雀_JP": 746,
788
+ "乔瓦尼_JP": 747,
789
+ "公输师傅_JP": 748,
790
+ "晴霓_JP": 749,
791
+ "螺丝咕姆_JP": 750,
792
+ "阿兰_JP": 751,
793
+ "奥列格_JP": 752,
794
+ "丹枢_JP": 753,
795
+ "尾巴_JP": 754,
796
+ "寒鸦_JP": 755,
797
+ "雪衣_JP": 756,
798
+ "可可利亚_JP": 757,
799
+ "青镞_JP": 758,
800
+ "半夏_JP": 759,
801
+ "银枝_JP": 760,
802
+ "大毫_JP": 761,
803
+ "霄翰_JP": 762,
804
+ "信使_JP": 763,
805
+ "费斯曼_JP": 764,
806
+ "绿芙蓉_JP": 765,
807
+ "dev_成男_JP": 766,
808
+ "金人会长_JP": 767,
809
+ "维利特_JP": 768,
810
+ "维尔德_JP": 769,
811
+ "斯科特_JP": 770,
812
+ "_JP": 771,
813
+ "卡波特_JP": 772,
814
+ "岩明_JP": 773,
815
+ "浣溪_JP": 774,
816
+ "净砚_JP": 775,
817
+ "紫月季_JP": 776,
818
+ "歌蒂_JP": 777,
819
+ "奇怪的云骑_JP": 778,
820
+ "幻胧_JP": 779,
821
+ "斯薇塔_JP": 780,
822
+ "隐书_JP": 781,
823
+ "三月七_EN": 782,
824
+ "丹恒_EN": 783,
825
+ "希儿_EN": 784,
826
+ "娜塔莎_EN": 785,
827
+ "希露瓦_EN": 786,
828
+ "瓦尔特_EN": 787,
829
+ "佩拉_EN": 788,
830
+ "布洛妮娅_EN": 789,
831
+ "虎克_EN": 790,
832
+ "素裳_EN": 791,
833
+ "克拉拉_EN": 792,
834
+ "符玄_EN": 793,
835
+ "白露_EN": 794,
836
+ "杰帕德_EN": 795,
837
+ "景元_EN": 796,
838
+ "藿藿_EN": 797,
839
+ "姬子_EN": 798,
840
+ "卡芙卡_EN": 799,
841
+ "_EN": 800,
842
+ "_EN": 801,
843
+ "桂乃芬_EN": 802,
844
+ "艾丝妲_EN": 803,
845
+ "彦卿_EN": 804,
846
+ "玲可_EN": 805,
847
+ "托帕_EN": 806,
848
+ "驭空_EN": 807,
849
+ "浮烟_EN": 808,
850
+ "停云_EN": 809,
851
+ "镜流_EN": 810,
852
+ "罗刹_EN": 811,
853
+ "卢卡_EN": 812,
854
+ "史瓦罗_EN": 813,
855
+ "黑塔_EN": 814,
856
+ "桑博_EN": 815,
857
+ "伦纳德_EN": 816,
858
+ "明曦_EN": 817,
859
+ "银狼_EN": 818,
860
+ "帕姆_EN": 819,
861
+ "青雀_EN": 820,
862
+ "乔瓦尼_EN": 821,
863
+ "公输师傅_EN": 822,
864
+ "晴霓_EN": 823,
865
+ "螺丝咕姆_EN": 824,
866
+ "阿兰_EN": 825,
867
+ "奥列格_EN": 826,
868
+ "丹枢_EN": 827,
869
+ "尾巴_EN": 828,
870
+ "寒鸦_EN": 829,
871
+ "雪衣_EN": 830,
872
+ "可可利亚_EN": 831,
873
+ "青镞_EN": 832,
874
+ "半夏_EN": 833,
875
+ "银枝_EN": 834,
876
+ "大毫_EN": 835,
877
+ "霄翰_EN": 836,
878
+ "信使_EN": 837,
879
+ "费斯曼_EN": 838,
880
+ "绿芙蓉_EN": 839,
881
+ "dev_成男_EN": 840,
882
+ "金人会长_EN": 841,
883
+ "维利特_EN": 842,
884
+ "维尔德_EN": 843,
885
+ "_EN": 844,
886
+ "卡波特_EN": 845,
887
+ "岩明_EN": 846,
888
+ "浣溪_EN": 847,
889
+ "紫月季_EN": 848,
890
+ "幻胧_EN": 849,
891
+ "女声_EN": 850,
892
+ "陆景和": 851,
893
+ "莫弈": 852,
894
+ "左然": 853,
895
+ "夏彦": 854
896
  }
897
  },
898
  "model": {
 
949
  "use_spectral_norm": false,
950
  "gin_channels": 256
951
  },
952
+ "version": "2.2"
953
  }
css/custom.css ADDED
@@ -0,0 +1,18 @@
1
+
2
+ #yml_code {
3
+ height: 600px;
4
+ flex-grow: inherit;
5
+ overflow-y: auto;
6
+ }
7
+
8
+ #json_code {
9
+ height: 600px;
10
+ flex-grow: inherit;
11
+ overflow-y: auto;
12
+ }
13
+
14
+ #gpu_code {
15
+ height: 300px;
16
+ flex-grow: inherit;
17
+ overflow-y: auto;
18
+ }
data_utils.py CHANGED
@@ -44,6 +44,10 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
44
  self.min_text_len = getattr(hparams, "min_text_len", 1)
45
  self.max_text_len = getattr(hparams, "max_text_len", 384)
46
 
 
 
 
 
47
  random.seed(1234)
48
  random.shuffle(self.audiopaths_sid_text)
49
  self._filter()
@@ -93,7 +97,14 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
93
 
94
  spec, wav = self.get_audio(audiopath)
95
  sid = torch.LongTensor([int(self.spk_map[sid])])
96
- emo = torch.FloatTensor(np.load(audiopath.replace(".wav", ".emo.npy")))
 
 
 
 
 
 
 
97
  return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert, emo)
98
 
99
  def get_audio(self, filename):
@@ -157,15 +168,15 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
157
 
158
  if language_str == "ZH":
159
  bert = bert_ori
160
- ja_bert = torch.zeros(1024, len(phone))
161
- en_bert = torch.zeros(1024, len(phone))
162
  elif language_str == "JP":
163
- bert = torch.zeros(1024, len(phone))
164
  ja_bert = bert_ori
165
- en_bert = torch.zeros(1024, len(phone))
166
  elif language_str == "EN":
167
- bert = torch.zeros(1024, len(phone))
168
- ja_bert = torch.zeros(1024, len(phone))
169
  en_bert = bert_ori
170
  phone = torch.LongTensor(phone)
171
  tone = torch.LongTensor(tone)
@@ -215,7 +226,7 @@ class TextAudioSpeakerCollate:
215
  bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
216
  ja_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
217
  en_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
218
- emo = torch.FloatTensor(len(batch), 1024)
219
 
220
  spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
221
  wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
 
44
  self.min_text_len = getattr(hparams, "min_text_len", 1)
45
  self.max_text_len = getattr(hparams, "max_text_len", 384)
46
 
47
+ self.empty_emo = torch.squeeze(
48
+ torch.load("empty_emo.npy", map_location="cpu"), dim=1
49
+ )
50
+
51
  random.seed(1234)
52
  random.shuffle(self.audiopaths_sid_text)
53
  self._filter()
 
97
 
98
  spec, wav = self.get_audio(audiopath)
99
  sid = torch.LongTensor([int(self.spk_map[sid])])
100
+
101
+ if np.random.rand() > 0.1:
102
+ emo = torch.squeeze(
103
+ torch.load(audiopath.replace(".wav", ".emo.npy"), map_location="cpu"),
104
+ dim=1,
105
+ )
106
+ else:
107
+ emo = self.empty_emo
108
  return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert, emo)
109
 
110
  def get_audio(self, filename):
 
168
 
169
  if language_str == "ZH":
170
  bert = bert_ori
171
+ ja_bert = torch.rand(1024, len(phone))
172
+ en_bert = torch.rand(1024, len(phone))
173
  elif language_str == "JP":
174
+ bert = torch.rand(1024, len(phone))
175
  ja_bert = bert_ori
176
+ en_bert = torch.rand(1024, len(phone))
177
  elif language_str == "EN":
178
+ bert = torch.rand(1024, len(phone))
179
+ ja_bert = torch.rand(1024, len(phone))
180
  en_bert = bert_ori
181
  phone = torch.LongTensor(phone)
182
  tone = torch.LongTensor(tone)
 
226
  bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
227
  ja_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
228
  en_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
229
+ emo = torch.FloatTensor(len(batch), 512)
230
 
231
  spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
232
  wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
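
The dataset now loads the cached 512-dim CLAP embedding per utterance and, roughly 10% of the time, substitutes the shared `empty_emo.npy` embedding so the model also learns to synthesize without an emotion reference. A hedged sketch of that branch (the wav path is hypothetical; `torch.load` is used because the `.emo.npy` files are written with `torch.save` in `clap_gen.py`):

```python
import numpy as np
import torch

audiopath = "Data/BangDreamV22/audios/wavs/example.wav"  # hypothetical

empty_emo = torch.squeeze(torch.load("empty_emo.npy", map_location="cpu"), dim=1)

if np.random.rand() > 0.1:
    # 90% of the time: use the cached CLAP embedding for this utterance.
    emo = torch.squeeze(
        torch.load(audiopath.replace(".wav", ".emo.npy"), map_location="cpu"),
        dim=1,
    )
else:
    # 10% of the time: drop the reference so inference works without one.
    emo = empty_emo

assert emo.shape == (512,)
```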
default_config.yml CHANGED
@@ -35,10 +35,10 @@ preprocess_text:
35
  val_path: "filelists/val.list"
36
  # 配置文件路径
37
  config_path: "config.json"
38
- # 每个speaker的验证集条数
39
- val_per_spk: 4
40
  # 验证集最大条数,多于的会被截断并放到训练集中
41
- max_val_total: 8
42
  # 是否进行数据清洗
43
  clean: true
44
 
@@ -49,7 +49,7 @@ bert_gen:
49
  # 训练数据集配置文件路径
50
  config_path: "config.json"
51
  # 并行数
52
- num_processes: 2
53
  # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
54
  # 该选项同时决定了get_bert_feature的默认设备
55
  device: "cuda"
@@ -62,9 +62,11 @@ emo_gen:
62
  # 训练数据集配置文件路径
63
  config_path: "config.json"
64
  # 并行数
65
- num_processes: 2
66
  # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
67
  device: "cuda"
 
 
68
 
69
  # train 训练配置
70
  # 注意, “:” 后需要加空格
@@ -81,11 +83,11 @@ train_ms:
81
  base:
82
  use_base_model: false
83
  repo_id: "Stardust_minus/Bert-VITS2"
84
- model_image: "Bert-VITS2_2.1-Emo底模" # openi网页的模型名
85
  # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
86
  model: "models"
87
  # 配置文件路径
88
- config_path: "configs/config.json"
89
  # 训练使用的worker,不建议超过CPU核心数
90
  num_workers: 16
91
  # 关闭此项可以节约接近50%的磁盘空间,但是可能导致实际训练速度变慢和更高的CPU使用率。
@@ -100,9 +102,9 @@ webui:
100
  # 推理设备
101
  device: "cuda"
102
  # 模型路径
103
- model: "genshin/models/G_8000.pth"
104
  # 配置文件路径
105
- config_path: "configs/config.json"
106
  # 端口号
107
  port: 7860
108
  # 是否公开部署,对外网开放
@@ -113,7 +115,7 @@ webui:
113
  language_identification_library: "langid"
114
 
115
 
116
- # server api配置
117
  # 注意, “:” 后需要加空格
118
  # 注意,本配置下的所有配置均为相对于根目录的路径
119
  server:
@@ -121,8 +123,10 @@ server:
121
  port: 5000
122
  # 模型默认使用设备:但是当前并没有实现这个配置。
123
  device: "cuda"
124
- # 需要加载的所有模型的配置
 
125
  # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。
 
126
  models:
127
  - # 模型的路径
128
  model: ""
@@ -163,7 +167,6 @@ server:
163
  # 不必填写所有人物,不填的使用默认值
164
  speakers: [ ] # 也可以不填
165
 
166
-
167
  # 百度翻译开放平台 api配置
168
  # api接入文档 https://api.fanyi.baidu.com/doc/21
169
  # 请不要在github等网站公开分享你的app id 与 key
 
35
  val_path: "filelists/val.list"
36
  # 配置文件路径
37
  config_path: "config.json"
38
+ # 每个语言的验证集条数
39
+ val_per_lang: 4
40
  # 验证集最大条数,多于的会被截断并放到训练集中
41
+ max_val_total: 12
42
  # 是否进行数据清洗
43
  clean: true
44
 
 
49
  # 训练数据集配置文件路径
50
  config_path: "config.json"
51
  # 并行数
52
+ num_processes: 4
53
  # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
54
  # 该选项同时决定了get_bert_feature的默认设备
55
  device: "cuda"
 
62
  # 训练数据集配置文件路径
63
  config_path: "config.json"
64
  # 并行数
65
+ num_processes: 4
66
  # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
67
  device: "cuda"
68
+ # 使用多卡推理
69
+ use_multi_device: false
70
 
71
  # train 训练配置
72
  # 注意, “:” 后需要加空格
 
83
  base:
84
  use_base_model: false
85
  repo_id: "Stardust_minus/Bert-VITS2"
86
+ model_image: "Bert-VITS2_2.2-Clap底模" # openi网页的模型名
87
  # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
88
  model: "models"
89
  # 配置文件路径
90
+ config_path: "config.json"
91
  # 训练使用的worker,不建议超过CPU核心数
92
  num_workers: 16
93
  # 关闭此项可以节约接近50%的磁盘空间,但是可能导致实际训练速度变慢和更高的CPU使用率。
 
102
  # 推理设备
103
  device: "cuda"
104
  # 模型路径
105
+ model: "models/G_8000.pth"
106
  # 配置文件路径
107
+ config_path: "config.json"
108
  # 端口号
109
  port: 7860
110
  # 是否公开部署,对外网开放
 
115
  language_identification_library: "langid"
116
 
117
 
118
+ # server-fastapi配置
119
  # 注意, “:” 后需要加空格
120
  # 注意,本配置下的所有配置均为相对于根目录的路径
121
  server:
 
123
  port: 5000
124
  # 模型默认使用设备:但是当前并没有实现这个配置。
125
  device: "cuda"
126
+ # 需要加载的所有模型的配置,可以填多个模型,也可以不填模型,等网页成功后手动加载模型
127
+ # 不加载模型的配置格式:删除默认给的两个模型配置,给models赋值 [ ],也就是空列表。参考模型2的speakers 即 models: [ ]
128
  # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。
129
+ # 也可以不填模型,等网页加载成功后手动填写models。
130
  models:
131
  - # 模型的路径
132
  model: ""
 
167
  # 不必填写所有人物,不填的使用默认值
168
  speakers: [ ] # 也可以不填
169
 
 
170
  # 百度翻译开放平台 api配置
171
  # api接入文档 https://api.fanyi.baidu.com/doc/21
172
  # 请不要在github等网站公开分享你的app id 与 key
emotional/clap-htsat-fused/.gitattributes ADDED
@@ -0,0 +1,34 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
emotional/clap-htsat-fused/README.md ADDED
@@ -0,0 +1,107 @@
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+ # Model card for CLAP
5
+
6
+ Model card for CLAP: Contrastive Language-Audio Pretraining
7
+
8
+ ![clap_image](https://s3.amazonaws.com/moonup/production/uploads/1678811100805-62441d1d9fdefb55a0b7d12c.png)
9
+
10
+
11
+ # Table of Contents
12
+
13
+ 0. [TL;DR](#TL;DR)
14
+ 1. [Model Details](#model-details)
15
+ 2. [Usage](#usage)
16
+ 3. [Uses](#uses)
17
+ 4. [Citation](#citation)
18
+
19
+ # TL;DR
20
+
21
+ The abstract of the paper states that:
22
+
23
+ > Contrastive learning has shown remarkable success in the field of multimodal representation learning. In this paper, we propose a pipeline of contrastive language-audio pretraining to develop an audio representation by combining audio data with natural language descriptions. To accomplish this target, we first release LAION-Audio-630K, a large collection of 633,526 audio-text pairs from different data sources. Second, we construct a contrastive language-audio pretraining model by considering different audio encoders and text encoders. We incorporate the feature fusion mechanism and keyword-to-caption augmentation into the model design to further enable the model to process audio inputs of variable lengths and enhance the performance. Third, we perform comprehensive experiments to evaluate our model across three tasks: text-to-audio retrieval, zero-shot audio classification, and supervised audio classification. The results demonstrate that our model achieves superior performance in text-to-audio retrieval task. In audio classification tasks, the model achieves state-of-the-art performance in the zero-shot setting and is able to obtain performance comparable to models' results in the non-zero-shot setting. LAION-Audio-630K and the proposed model are both available to the public.
24
+
25
+
26
+ # Usage
27
+
28
+ You can use this model for zero shot audio classification or extracting audio and/or textual features.
29
+
30
+ # Uses
31
+
32
+ ## Perform zero-shot audio classification
33
+
34
+ ### Using `pipeline`
35
+
36
+ ```python
37
+ from datasets import load_dataset
38
+ from transformers import pipeline
39
+
40
+ dataset = load_dataset("ashraq/esc50")
41
+ audio = dataset["train"]["audio"][-1]["array"]
42
+
43
+ audio_classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-fused")
44
+ output = audio_classifier(audio, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"])
45
+ print(output)
46
+ >>> [{"score": 0.999, "label": "Sound of a dog"}, {"score": 0.001, "label": "Sound of vacuum cleaner"}]
47
+ ```
48
+
49
+ ## Run the model:
50
+
51
+ You can also get the audio and text embeddings using `ClapModel`.
52
+
53
+ ### Run the model on CPU:
54
+
55
+ ```python
56
+ from datasets import load_dataset
57
+ from transformers import ClapModel, ClapProcessor
58
+
59
+ librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
60
+ audio_sample = librispeech_dummy[0]
61
+
62
+ model = ClapModel.from_pretrained("laion/clap-htsat-fused")
63
+ processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")
64
+
65
+ inputs = processor(audios=audio_sample["audio"]["array"], return_tensors="pt")
66
+ audio_embed = model.get_audio_features(**inputs)
67
+ ```
68
+
69
+ ### Run the model on GPU:
70
+
71
+ ```python
72
+ from datasets import load_dataset
73
+ from transformers import ClapModel, ClapProcessor
74
+
75
+ librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
76
+ audio_sample = librispeech_dummy[0]
77
+
78
+ model = ClapModel.from_pretrained("laion/clap-htsat-fused").to(0)
79
+ processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")
80
+
81
+ inputs = processor(audios=audio_sample["audio"]["array"], return_tensors="pt").to(0)
82
+ audio_embed = model.get_audio_features(**inputs)
83
+ ```
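+
+ ### Get the text embeddings:
+
+ The card also mentions extracting text features; a minimal sketch of doing so with the same checkpoint, assuming the standard `ClapProcessor`/`ClapModel` text API:
+
+ ```python
+ from transformers import ClapModel, ClapProcessor
+
+ model = ClapModel.from_pretrained("laion/clap-htsat-fused")
+ processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")
+
+ # Tokenize candidate captions and project them into the shared audio-text space
+ inputs = processor(text=["Sound of a dog", "Sound of a vacuum cleaner"], return_tensors="pt", padding=True)
+ text_embed = model.get_text_features(**inputs)  # (num_texts, projection_dim)
+ ```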
84
+
85
+
86
+ # Citation
87
+
88
+ If you are using this model for your work, please consider citing the original paper:
89
+ ```
90
+ @misc{https://doi.org/10.48550/arxiv.2211.06687,
91
+ doi = {10.48550/ARXIV.2211.06687},
92
+
93
+ url = {https://arxiv.org/abs/2211.06687},
94
+
95
+ author = {Wu, Yusong and Chen, Ke and Zhang, Tianyu and Hui, Yuchen and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
96
+
97
+ keywords = {Sound (cs.SD), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering},
98
+
99
+ title = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
100
+
101
+ publisher = {arXiv},
102
+
103
+ year = {2022},
104
+
105
+ copyright = {Creative Commons Attribution 4.0 International}
106
+ }
107
+ ```
emotional/clap-htsat-fused/config.json ADDED
@@ -0,0 +1,207 @@
1
+ {
2
+ "_commit_hash": null,
3
+ "architectures": [
4
+ "ClapModel"
5
+ ],
6
+ "audio_config": {
7
+ "_name_or_path": "",
8
+ "add_cross_attention": false,
9
+ "aff_block_r": 4,
10
+ "architectures": null,
11
+ "attention_probs_dropout_prob": 0.0,
12
+ "bad_words_ids": null,
13
+ "begin_suppress_tokens": null,
14
+ "bos_token_id": null,
15
+ "chunk_size_feed_forward": 0,
16
+ "cross_attention_hidden_size": null,
17
+ "decoder_start_token_id": null,
18
+ "depths": [
19
+ 2,
20
+ 2,
21
+ 6,
22
+ 2
23
+ ],
24
+ "diversity_penalty": 0.0,
25
+ "do_sample": false,
26
+ "drop_path_rate": 0.0,
27
+ "early_stopping": false,
28
+ "enable_fusion": true,
29
+ "enable_patch_fusion": true,
30
+ "enable_patch_layer_norm": true,
31
+ "encoder_no_repeat_ngram_size": 0,
32
+ "eos_token_id": null,
33
+ "exponential_decay_length_penalty": null,
34
+ "finetuning_task": null,
35
+ "flatten_patch_embeds": true,
36
+ "forced_bos_token_id": null,
37
+ "forced_eos_token_id": null,
38
+ "fusion_num_hidden_layers": 2,
39
+ "fusion_type": null,
40
+ "hidden_act": "gelu",
41
+ "hidden_dropout_prob": 0.1,
42
+ "hidden_size": 768,
43
+ "id2label": {
44
+ "0": "LABEL_0",
45
+ "1": "LABEL_1"
46
+ },
47
+ "initializer_factor": 1.0,
48
+ "is_decoder": false,
49
+ "is_encoder_decoder": false,
50
+ "label2id": {
51
+ "LABEL_0": 0,
52
+ "LABEL_1": 1
53
+ },
54
+ "layer_norm_eps": 1e-05,
55
+ "length_penalty": 1.0,
56
+ "max_length": 20,
57
+ "min_length": 0,
58
+ "mlp_ratio": 4.0,
59
+ "model_type": "clap_audio_model",
60
+ "no_repeat_ngram_size": 0,
61
+ "num_attention_heads": [
62
+ 4,
63
+ 8,
64
+ 16,
65
+ 32
66
+ ],
67
+ "num_beam_groups": 1,
68
+ "num_beams": 1,
69
+ "num_classes": 527,
70
+ "num_hidden_layers": 4,
71
+ "num_mel_bins": 64,
72
+ "num_return_sequences": 1,
73
+ "output_attentions": false,
74
+ "output_hidden_states": false,
75
+ "output_scores": false,
76
+ "pad_token_id": null,
77
+ "patch_embed_input_channels": 1,
78
+ "patch_embeds_hidden_size": 96,
79
+ "patch_size": 4,
80
+ "patch_stride": [
81
+ 4,
82
+ 4
83
+ ],
84
+ "prefix": null,
85
+ "problem_type": null,
86
+ "projection_dim": 512,
87
+ "projection_hidden_act": "relu",
88
+ "projection_hidden_size": 768,
89
+ "pruned_heads": {},
90
+ "qkv_bias": true,
91
+ "remove_invalid_values": false,
92
+ "repetition_penalty": 1.0,
93
+ "return_dict": true,
94
+ "return_dict_in_generate": false,
95
+ "sep_token_id": null,
96
+ "spec_size": 256,
97
+ "suppress_tokens": null,
98
+ "task_specific_params": null,
99
+ "temperature": 1.0,
100
+ "tf_legacy_loss": false,
101
+ "tie_encoder_decoder": false,
102
+ "tie_word_embeddings": true,
103
+ "tokenizer_class": null,
104
+ "top_k": 50,
105
+ "top_p": 1.0,
106
+ "torch_dtype": null,
107
+ "torchscript": false,
108
+ "transformers_version": "4.27.0.dev0",
109
+ "typical_p": 1.0,
110
+ "use_bfloat16": false,
111
+ "window_size": 8
112
+ },
113
+ "hidden_size": 768,
114
+ "initializer_factor": 1.0,
115
+ "logit_scale_init_value": 14.285714285714285,
116
+ "model_type": "clap",
117
+ "num_hidden_layers": 16,
118
+ "projection_dim": 512,
119
+ "projection_hidden_act": "relu",
120
+ "text_config": {
121
+ "_name_or_path": "",
122
+ "add_cross_attention": false,
123
+ "architectures": null,
124
+ "attention_probs_dropout_prob": 0.1,
125
+ "bad_words_ids": null,
126
+ "begin_suppress_tokens": null,
127
+ "bos_token_id": 0,
128
+ "chunk_size_feed_forward": 0,
129
+ "classifier_dropout": null,
130
+ "cross_attention_hidden_size": null,
131
+ "decoder_start_token_id": null,
132
+ "diversity_penalty": 0.0,
133
+ "do_sample": false,
134
+ "early_stopping": false,
135
+ "encoder_no_repeat_ngram_size": 0,
136
+ "eos_token_id": 2,
137
+ "exponential_decay_length_penalty": null,
138
+ "finetuning_task": null,
139
+ "forced_bos_token_id": null,
140
+ "forced_eos_token_id": null,
141
+ "fusion_hidden_size": 768,
142
+ "fusion_num_hidden_layers": 2,
143
+ "hidden_act": "gelu",
144
+ "hidden_dropout_prob": 0.1,
145
+ "hidden_size": 768,
146
+ "id2label": {
147
+ "0": "LABEL_0",
148
+ "1": "LABEL_1"
149
+ },
150
+ "initializer_factor": 1.0,
151
+ "initializer_range": 0.02,
152
+ "intermediate_size": 3072,
153
+ "is_decoder": false,
154
+ "is_encoder_decoder": false,
155
+ "label2id": {
156
+ "LABEL_0": 0,
157
+ "LABEL_1": 1
158
+ },
159
+ "layer_norm_eps": 1e-12,
160
+ "length_penalty": 1.0,
161
+ "max_length": 20,
162
+ "max_position_embeddings": 514,
163
+ "min_length": 0,
164
+ "model_type": "clap_text_model",
165
+ "no_repeat_ngram_size": 0,
166
+ "num_attention_heads": 12,
167
+ "num_beam_groups": 1,
168
+ "num_beams": 1,
169
+ "num_hidden_layers": 12,
170
+ "num_return_sequences": 1,
171
+ "output_attentions": false,
172
+ "output_hidden_states": false,
173
+ "output_scores": false,
174
+ "pad_token_id": 1,
175
+ "position_embedding_type": "absolute",
176
+ "prefix": null,
177
+ "problem_type": null,
178
+ "projection_dim": 512,
179
+ "projection_hidden_act": "relu",
180
+ "projection_hidden_size": 768,
181
+ "pruned_heads": {},
182
+ "remove_invalid_values": false,
183
+ "repetition_penalty": 1.0,
184
+ "return_dict": true,
185
+ "return_dict_in_generate": false,
186
+ "sep_token_id": null,
187
+ "suppress_tokens": null,
188
+ "task_specific_params": null,
189
+ "temperature": 1.0,
190
+ "tf_legacy_loss": false,
191
+ "tie_encoder_decoder": false,
192
+ "tie_word_embeddings": true,
193
+ "tokenizer_class": null,
194
+ "top_k": 50,
195
+ "top_p": 1.0,
196
+ "torch_dtype": null,
197
+ "torchscript": false,
198
+ "transformers_version": "4.27.0.dev0",
199
+ "type_vocab_size": 1,
200
+ "typical_p": 1.0,
201
+ "use_bfloat16": false,
202
+ "use_cache": true,
203
+ "vocab_size": 50265
204
+ },
205
+ "torch_dtype": "float32",
206
+ "transformers_version": null
207
+ }
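The nested audio/text settings above are what `transformers` reads back when the checkpoint is loaded. A quick way to inspect them (a sketch; the hub id `laion/clap-htsat-fused` is assumed to match this local copy):

```python
from transformers import ClapConfig

config = ClapConfig.from_pretrained("laion/clap-htsat-fused")
print(config.audio_config.enable_fusion)  # True: the HTSAT encoder fuses global and local mel patches
print(config.text_config.vocab_size)      # 50265: RoBERTa-style text tower
print(config.projection_dim)              # 512: shared audio-text embedding size
```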
emotional/clap-htsat-fused/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
emotional/clap-htsat-fused/preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "chunk_length_s": 10,
3
+ "feature_extractor_type": "ClapFeatureExtractor",
4
+ "feature_size": 64,
5
+ "fft_window_size": 1024,
6
+ "frequency_max": 14000,
7
+ "frequency_min": 50,
8
+ "hop_length": 480,
9
+ "max_length_s": 10,
10
+ "n_fft": 1024,
11
+ "nb_frequency_bins": 513,
12
+ "nb_max_frames": 1000,
13
+ "nb_max_samples": 480000,
14
+ "padding": "repeatpad",
15
+ "padding_side": "right",
16
+ "padding_value": 0.0,
17
+ "processor_class": "ClapProcessor",
18
+ "return_attention_mask": false,
19
+ "sampling_rate": 48000,
20
+ "top_db": null,
21
+ "truncation": "fusion"
22
+ }
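The preprocessor above expects 48 kHz audio (64 mel bins, 1024-point FFT, hop 480, "fusion" truncation for clips longer than 10 s). A minimal sketch of feeding it a waveform, assuming the input may need resampling first (the 16 kHz source rate below is just an example):

```python
import numpy as np
import librosa
from transformers import ClapProcessor

processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

wav_16k = np.random.randn(16000 * 3).astype(np.float32)             # placeholder 3 s clip at 16 kHz
wav_48k = librosa.resample(wav_16k, orig_sr=16000, target_sr=48000)  # match the configured sampling_rate

inputs = processor(audios=wav_48k, sampling_rate=48000, return_tensors="pt")
print(inputs["input_features"].shape)  # mel features consumed by the CLAP audio encoder
```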
emotional/clap-htsat-fused/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ed5d0215d887551ddd0a49ce7311b21429ebdf1e6a129d4e68f743357225253
3
+ size 614596545
emotional/clap-htsat-fused/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
emotional/clap-htsat-fused/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
emotional/clap-htsat-fused/tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
+ "cls_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "errors": "replace",
7
+ "mask_token": "<mask>",
8
+ "model_max_length": 512,
9
+ "pad_token": "<pad>",
10
+ "processor_class": "ClapProcessor",
11
+ "sep_token": "</s>",
12
+ "special_tokens_map_file": null,
13
+ "tokenizer_class": "RobertaTokenizer",
14
+ "trim_offsets": true,
15
+ "unk_token": "<unk>"
16
+ }
emotional/clap-htsat-fused/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
empty_emo.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07063411ab7d6e7aacfc73c582616c3fbc8fdf518b20d42d8be77bc9caf6fab9
3
+ size 3238
export_onnx.py CHANGED
@@ -1,54 +1,10 @@
1
- from models_onnx import SynthesizerTrn
2
- import utils
3
- from text.symbols import symbols
4
  import os
5
- import json
6
-
7
-
8
- def export_onnx(export_path, model_path, config_path):
9
- hps = utils.get_hparams_from_file(config_path)
10
- net_g = SynthesizerTrn(
11
- len(symbols),
12
- hps.data.filter_length // 2 + 1,
13
- hps.train.segment_size // hps.data.hop_length,
14
- n_speakers=hps.data.n_speakers,
15
- **hps.model,
16
- )
17
- _ = net_g.eval()
18
- _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
19
- net_g.export_onnx(export_path)
20
-
21
- spklist = []
22
- for key in hps.data.spk2id.keys():
23
- spklist.append(key)
24
-
25
- MoeVSConf = {
26
- "Folder": f"{export_path}",
27
- "Name": f"{export_path}",
28
- "Type": "BertVits",
29
- "Symbol": symbols,
30
- "Cleaner": "",
31
- "Rate": hps.data.sampling_rate,
32
- "CharaMix": True,
33
- "Characters": spklist,
34
- "LanguageMap": {"ZH": [0, 0], "JP": [1, 6], "EN": [2, 8]},
35
- "Dict": "BasicDict",
36
- "BertPath": [
37
- "chinese-roberta-wwm-ext-large",
38
- "deberta-v2-large-japanese",
39
- "bert-base-japanese-v3",
40
- ],
41
- }
42
-
43
- with open(f"onnx/{export_path}.json", "w") as MoeVsConfFile:
44
- json.dump(MoeVSConf, MoeVsConfFile, indent=4)
45
-
46
 
47
  if __name__ == "__main__":
48
- print(symbols)
49
- export_path = "HimenoSena"
50
- model_path = "G_53000.pth"
51
- config_path = "config.json"
52
  if not os.path.exists("onnx"):
53
  os.makedirs("onnx")
54
  if not os.path.exists(f"onnx/{export_path}"):
 
1
+ from onnx_modules import export_onnx
 
 
2
  import os
 
 
3
 
4
  if __name__ == "__main__":
5
+ export_path = "BertVits2.2PT"
6
+ model_path = "model\\G_0.pth"
7
+ config_path = "model\\config.json"
 
8
  if not os.path.exists("onnx"):
9
  os.makedirs("onnx")
10
  if not os.path.exists(f"onnx/{export_path}"):
img/yuyu.png ADDED
img//345/217/202/346/225/260/350/257/264/346/230/216.png ADDED
img//345/256/265/345/256/253.png ADDED
img//345/276/256/344/277/241/345/233/276/347/211/207_20231010105112.png ADDED
img//347/245/236/351/207/214/347/273/253/345/215/216.png ADDED
img//347/272/263/350/245/277/345/246/262.png ADDED
infer.py ADDED
@@ -0,0 +1,381 @@
1
+ """
2
+ Version management, backward-compatible inference, and model loading.
3
+ Versioning notes:
4
+ 1. Version numbers correspond to GitHub release tags; a model trained with a given release carries that release's version number.
5
+ 2. Please declare the version explicitly in the model's config.json by adding a field "version" : "<your version number>".
6
+ Special versions:
7
+ 1.1.1-fix: models trained with 1.1.1, but inference uses the Japanese fix from the dev branch
8
+ 2.2: the current version
9
+ """
10
+ import torch
11
+ import commons
12
+ from text import cleaned_text_to_sequence, get_bert
13
+ from clap_wrapper import get_clap_audio_feature, get_clap_text_feature
14
+ from text.cleaner import clean_text
15
+ import utils
16
+ import numpy as np
17
+
18
+ from models import SynthesizerTrn
19
+ from text.symbols import symbols
20
+
21
+ from oldVersion.V210.models import SynthesizerTrn as V210SynthesizerTrn
22
+ from oldVersion.V210.text import symbols as V210symbols
23
+ from oldVersion.V200.models import SynthesizerTrn as V200SynthesizerTrn
24
+ from oldVersion.V200.text import symbols as V200symbols
25
+ from oldVersion.V111.models import SynthesizerTrn as V111SynthesizerTrn
26
+ from oldVersion.V111.text import symbols as V111symbols
27
+ from oldVersion.V110.models import SynthesizerTrn as V110SynthesizerTrn
28
+ from oldVersion.V110.text import symbols as V110symbols
29
+ from oldVersion.V101.models import SynthesizerTrn as V101SynthesizerTrn
30
+ from oldVersion.V101.text import symbols as V101symbols
31
+
32
+ from oldVersion import V111, V110, V101, V200, V210
33
+
34
+ # Current version info
35
+ latest_version = "2.2"
36
+
37
+ # Version compatibility
38
+ SynthesizerTrnMap = {
39
+ "2.1": V210SynthesizerTrn,
40
+ "2.0.2-fix": V200SynthesizerTrn,
41
+ "2.0.1": V200SynthesizerTrn,
42
+ "2.0": V200SynthesizerTrn,
43
+ "1.1.1-fix": V111SynthesizerTrn,
44
+ "1.1.1": V111SynthesizerTrn,
45
+ "1.1": V110SynthesizerTrn,
46
+ "1.1.0": V110SynthesizerTrn,
47
+ "1.0.1": V101SynthesizerTrn,
48
+ "1.0": V101SynthesizerTrn,
49
+ "1.0.0": V101SynthesizerTrn,
50
+ }
51
+
52
+ symbolsMap = {
53
+ "2.1": V210symbols,
54
+ "2.0.2-fix": V200symbols,
55
+ "2.0.1": V200symbols,
56
+ "2.0": V200symbols,
57
+ "1.1.1-fix": V111symbols,
58
+ "1.1.1": V111symbols,
59
+ "1.1": V110symbols,
60
+ "1.1.0": V110symbols,
61
+ "1.0.1": V101symbols,
62
+ "1.0": V101symbols,
63
+ "1.0.0": V101symbols,
64
+ }
65
+
66
+
67
+ # def get_emo_(reference_audio, emotion, sid):
68
+ # emo = (
69
+ # torch.from_numpy(get_emo(reference_audio))
70
+ # if reference_audio and emotion == -1
71
+ # else torch.FloatTensor(
72
+ # np.load(f"emo_clustering/{sid}/cluster_center_{emotion}.npy")
73
+ # )
74
+ # )
75
+ # return emo
76
+
77
+
78
+ def get_net_g(model_path: str, version: str, device: str, hps):
79
+ if version != latest_version:
80
+ net_g = SynthesizerTrnMap[version](
81
+ len(symbolsMap[version]),
82
+ hps.data.filter_length // 2 + 1,
83
+ hps.train.segment_size // hps.data.hop_length,
84
+ n_speakers=hps.data.n_speakers,
85
+ **hps.model,
86
+ ).to(device)
87
+ else:
88
+ # net_g for the current version
89
+ net_g = SynthesizerTrn(
90
+ len(symbols),
91
+ hps.data.filter_length // 2 + 1,
92
+ hps.train.segment_size // hps.data.hop_length,
93
+ n_speakers=hps.data.n_speakers,
94
+ **hps.model,
95
+ ).to(device)
96
+ _ = net_g.eval()
97
+ _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
98
+ return net_g
99
+
100
+
101
+ def get_text(text, language_str, hps, device):
102
+ # get_text for the current version is implemented here
103
+ norm_text, phone, tone, word2ph = clean_text(text, language_str)
104
+ phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
105
+
106
+ if hps.data.add_blank:
107
+ phone = commons.intersperse(phone, 0)
108
+ tone = commons.intersperse(tone, 0)
109
+ language = commons.intersperse(language, 0)
110
+ for i in range(len(word2ph)):
111
+ word2ph[i] = word2ph[i] * 2
112
+ word2ph[0] += 1
113
+ bert_ori = get_bert(norm_text, word2ph, language_str, device)
114
+ del word2ph
115
+ assert bert_ori.shape[-1] == len(phone), phone
116
+
117
+ if language_str == "ZH":
118
+ bert = bert_ori
119
+ ja_bert = torch.rand(1024, len(phone))
120
+ en_bert = torch.rand(1024, len(phone))
121
+ elif language_str == "JP":
122
+ bert = torch.rand(1024, len(phone))
123
+ ja_bert = bert_ori
124
+ en_bert = torch.rand(1024, len(phone))
125
+ elif language_str == "EN":
126
+ bert = torch.rand(1024, len(phone))
127
+ ja_bert = torch.rand(1024, len(phone))
128
+ en_bert = bert_ori
129
+ else:
130
+ raise ValueError("language_str should be ZH, JP or EN")
131
+
132
+ assert bert.shape[-1] == len(
133
+ phone
134
+ ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
135
+
136
+ phone = torch.LongTensor(phone)
137
+ tone = torch.LongTensor(tone)
138
+ language = torch.LongTensor(language)
139
+ return bert, ja_bert, en_bert, phone, tone, language
140
+
141
+
142
+ def infer(
143
+ text,
144
+ emotion,
145
+ sdp_ratio,
146
+ noise_scale,
147
+ noise_scale_w,
148
+ length_scale,
149
+ sid,
150
+ language,
151
+ hps,
152
+ net_g,
153
+ device,
154
+ reference_audio=None,
155
+ skip_start=False,
156
+ skip_end=False,
157
+ ):
158
+ # The argument order changed in version 2.2
159
+ # 2.1 added the emotion, reference_audio, skip_start and skip_end parameters
160
+ inferMap_V3 = {
161
+ "2.1": V210.infer,
162
+ }
163
+ # Versions that support Chinese, Japanese, and English
164
+ inferMap_V2 = {
165
+ "2.0.2-fix": V200.infer,
166
+ "2.0.1": V200.infer,
167
+ "2.0": V200.infer,
168
+ "1.1.1-fix": V111.infer_fix,
169
+ "1.1.1": V111.infer,
170
+ "1.1": V110.infer,
171
+ "1.1.0": V110.infer,
172
+ }
173
+ # Chinese-only versions
174
+ # In testing, models from these versions were found to be mutually compatible
175
+ inferMap_V1 = {
176
+ "1.0.1": V101.infer,
177
+ "1.0": V101.infer,
178
+ "1.0.0": V101.infer,
179
+ }
180
+ version = hps.version if hasattr(hps, "version") else latest_version
181
+ # Not the current version: choose the appropriate infer by version number
182
+ if version != latest_version:
183
+ if version in inferMap_V3.keys():
184
+ return inferMap_V3[version](
185
+ text,
186
+ sdp_ratio,
187
+ noise_scale,
188
+ noise_scale_w,
189
+ length_scale,
190
+ sid,
191
+ language,
192
+ hps,
193
+ net_g,
194
+ device,
195
+ reference_audio,
196
+ emotion,
197
+ skip_start,
198
+ skip_end,
199
+ )
200
+ if version in inferMap_V2.keys():
201
+ return inferMap_V2[version](
202
+ text,
203
+ sdp_ratio,
204
+ noise_scale,
205
+ noise_scale_w,
206
+ length_scale,
207
+ sid,
208
+ language,
209
+ hps,
210
+ net_g,
211
+ device,
212
+ )
213
+ if version in inferMap_V1.keys():
214
+ return inferMap_V1[version](
215
+ text,
216
+ sdp_ratio,
217
+ noise_scale,
218
+ noise_scale_w,
219
+ length_scale,
220
+ sid,
221
+ hps,
222
+ net_g,
223
+ device,
224
+ )
225
+ # Inference for the current version is implemented below
226
+ # emo = get_emo_(reference_audio, emotion, sid)
227
+ if isinstance(reference_audio, np.ndarray):
228
+ emo = get_clap_audio_feature(reference_audio, device)
229
+ else:
230
+ emo = get_clap_text_feature(emotion, device)
231
+ emo = torch.squeeze(emo, dim=1)
232
+
233
+ bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
234
+ text, language, hps, device
235
+ )
236
+ if skip_start:
237
+ phones = phones[3:]
238
+ tones = tones[3:]
239
+ lang_ids = lang_ids[3:]
240
+ bert = bert[:, 3:]
241
+ ja_bert = ja_bert[:, 3:]
242
+ en_bert = en_bert[:, 3:]
243
+ if skip_end:
244
+ phones = phones[:-2]
245
+ tones = tones[:-2]
246
+ lang_ids = lang_ids[:-2]
247
+ bert = bert[:, :-2]
248
+ ja_bert = ja_bert[:, :-2]
249
+ en_bert = en_bert[:, :-2]
250
+ with torch.no_grad():
251
+ x_tst = phones.to(device).unsqueeze(0)
252
+ tones = tones.to(device).unsqueeze(0)
253
+ lang_ids = lang_ids.to(device).unsqueeze(0)
254
+ bert = bert.to(device).unsqueeze(0)
255
+ ja_bert = ja_bert.to(device).unsqueeze(0)
256
+ en_bert = en_bert.to(device).unsqueeze(0)
257
+ x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
258
+ emo = emo.to(device).unsqueeze(0)
259
+ del phones
260
+ speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
261
+ audio = (
262
+ net_g.infer(
263
+ x_tst,
264
+ x_tst_lengths,
265
+ speakers,
266
+ tones,
267
+ lang_ids,
268
+ bert,
269
+ ja_bert,
270
+ en_bert,
271
+ emo,
272
+ sdp_ratio=sdp_ratio,
273
+ noise_scale=noise_scale,
274
+ noise_scale_w=noise_scale_w,
275
+ length_scale=length_scale,
276
+ )[0][0, 0]
277
+ .data.cpu()
278
+ .float()
279
+ .numpy()
280
+ )
281
+ del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo
282
+ if torch.cuda.is_available():
283
+ torch.cuda.empty_cache()
284
+ return audio
285
+
286
+
287
+ def infer_multilang(
288
+ text,
289
+ sdp_ratio,
290
+ noise_scale,
291
+ noise_scale_w,
292
+ length_scale,
293
+ sid,
294
+ language,
295
+ hps,
296
+ net_g,
297
+ device,
298
+ reference_audio=None,
299
+ emotion=None,
300
+ skip_start=False,
301
+ skip_end=False,
302
+ ):
303
+ bert, ja_bert, en_bert, phones, tones, lang_ids = [], [], [], [], [], []
304
+ # emo = get_emo_(reference_audio, emotion, sid)
305
+ if isinstance(reference_audio, np.ndarray):
306
+ emo = get_clap_audio_feature(reference_audio, device)
307
+ else:
308
+ emo = get_clap_text_feature(emotion, device)
309
+ emo = torch.squeeze(emo, dim=1)
310
+ for idx, (txt, lang) in enumerate(zip(text, language)):
311
+ skip_start = (idx != 0) or (skip_start and idx == 0)
312
+ skip_end = (idx != len(text) - 1) or (skip_end and idx == len(text) - 1)
313
+ (
314
+ temp_bert,
315
+ temp_ja_bert,
316
+ temp_en_bert,
317
+ temp_phones,
318
+ temp_tones,
319
+ temp_lang_ids,
320
+ ) = get_text(txt, lang, hps, device)
321
+ if skip_start:
322
+ temp_bert = temp_bert[:, 3:]
323
+ temp_ja_bert = temp_ja_bert[:, 3:]
324
+ temp_en_bert = temp_en_bert[:, 3:]
325
+ temp_phones = temp_phones[3:]
326
+ temp_tones = temp_tones[3:]
327
+ temp_lang_ids = temp_lang_ids[3:]
328
+ if skip_end:
329
+ temp_bert = temp_bert[:, :-2]
330
+ temp_ja_bert = temp_ja_bert[:, :-2]
331
+ temp_en_bert = temp_en_bert[:, :-2]
332
+ temp_phones = temp_phones[:-2]
333
+ temp_tones = temp_tones[:-2]
334
+ temp_lang_ids = temp_lang_ids[:-2]
335
+ bert.append(temp_bert)
336
+ ja_bert.append(temp_ja_bert)
337
+ en_bert.append(temp_en_bert)
338
+ phones.append(temp_phones)
339
+ tones.append(temp_tones)
340
+ lang_ids.append(temp_lang_ids)
341
+ bert = torch.concatenate(bert, dim=1)
342
+ ja_bert = torch.concatenate(ja_bert, dim=1)
343
+ en_bert = torch.concatenate(en_bert, dim=1)
344
+ phones = torch.concatenate(phones, dim=0)
345
+ tones = torch.concatenate(tones, dim=0)
346
+ lang_ids = torch.concatenate(lang_ids, dim=0)
347
+ with torch.no_grad():
348
+ x_tst = phones.to(device).unsqueeze(0)
349
+ tones = tones.to(device).unsqueeze(0)
350
+ lang_ids = lang_ids.to(device).unsqueeze(0)
351
+ bert = bert.to(device).unsqueeze(0)
352
+ ja_bert = ja_bert.to(device).unsqueeze(0)
353
+ en_bert = en_bert.to(device).unsqueeze(0)
354
+ emo = emo.to(device).unsqueeze(0)
355
+ x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
356
+ del phones
357
+ speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
358
+ audio = (
359
+ net_g.infer(
360
+ x_tst,
361
+ x_tst_lengths,
362
+ speakers,
363
+ tones,
364
+ lang_ids,
365
+ bert,
366
+ ja_bert,
367
+ en_bert,
368
+ emo,
369
+ sdp_ratio=sdp_ratio,
370
+ noise_scale=noise_scale,
371
+ noise_scale_w=noise_scale_w,
372
+ length_scale=length_scale,
373
+ )[0][0, 0]
374
+ .data.cpu()
375
+ .float()
376
+ .numpy()
377
+ )
378
+ del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo
379
+ if torch.cuda.is_available():
380
+ torch.cuda.empty_cache()
381
+ return audio
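A minimal sketch of driving the two entry points above; the checkpoint/config paths and speaker name are placeholders, and `utils.get_hparams_from_file` is the existing loader in this repo:

```python
import torch
import utils
from infer import get_net_g, infer, latest_version

device = "cuda" if torch.cuda.is_available() else "cpu"
hps = utils.get_hparams_from_file("path/to/config.json")       # placeholder path
version = getattr(hps, "version", latest_version)
net_g = get_net_g("path/to/G_xxxx.pth", version, device, hps)  # placeholder checkpoint

audio = infer(
    "こんにちは",                         # text to synthesize
    emotion="Happy",                       # v2.2: a CLAP text prompt is used when no reference audio is given
    sdp_ratio=0.2,
    noise_scale=0.6,
    noise_scale_w=0.8,
    length_scale=1.0,
    sid=list(hps.data.spk2id.keys())[0],   # any speaker name present in the config
    language="JP",
    hps=hps,
    net_g=net_g,
    device=device,
)
```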
models.py CHANGED
@@ -10,11 +10,12 @@ import monotonic_align
10
 
11
  from torch.nn import Conv1d, ConvTranspose1d, Conv2d
12
  from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13
- from vector_quantize_pytorch import VectorQuantize
14
 
15
  from commons import init_weights, get_padding
16
  from text import symbols, num_tones, num_languages
17
 
 
 
18
 
19
  class DurationDiscriminator(nn.Module): # vits2
20
  def __init__(
@@ -311,6 +312,37 @@ class DurationPredictor(nn.Module):
311
  return x * x_mask
312
 
313
 
 
314
  class TextEncoder(nn.Module):
315
  def __init__(
316
  self,
@@ -344,18 +376,31 @@ class TextEncoder(nn.Module):
344
  self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
345
  self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
346
  self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
347
- self.emo_proj = nn.Linear(1024, 1024)
348
- self.emo_quantizer = [
349
- VectorQuantize(
350
- dim=1024,
351
- codebook_size=10,
352
- decay=0.8,
353
- commitment_weight=1.0,
354
- learnable_codebook=True,
355
- ema_update=False,
356
- )
357
- ] * n_speakers
358
- self.emo_q_proj = nn.Linear(1024, hidden_channels)
 
 
359
 
360
  self.encoder = attentions.Encoder(
361
  hidden_channels,
@@ -375,26 +420,11 @@ class TextEncoder(nn.Module):
375
  bert_emb = self.bert_proj(bert).transpose(1, 2)
376
  ja_bert_emb = self.ja_bert_proj(ja_bert).transpose(1, 2)
377
  en_bert_emb = self.en_bert_proj(en_bert).transpose(1, 2)
378
- if emo.size(-1) == 1024:
379
- emo_emb = self.emo_proj(emo.unsqueeze(1))
380
- emo_commit_loss = torch.zeros(1)
381
- emo_emb_ = []
382
- for i in range(emo_emb.size(0)):
383
- temp_emo_emb, _, temp_emo_commit_loss = self.emo_quantizer[sid[i]](
384
- emo_emb[i].unsqueeze(0).cpu()
385
- )
386
- emo_commit_loss += temp_emo_commit_loss
387
- emo_emb_.append(temp_emo_emb)
388
- emo_emb = torch.cat(emo_emb_, dim=0).to(emo_emb.device)
389
- emo_commit_loss = emo_commit_loss.to(emo_emb.device)
390
- else:
391
- emo_emb = (
392
- self.emo_quantizer[sid[0]]
393
- .get_output_from_indices(emo.to(torch.int).cpu())
394
- .unsqueeze(0)
395
- .to(emo.device)
396
- )
397
- emo_commit_loss = torch.zeros(1)
398
  x = (
399
  self.emb(x)
400
  + self.tone_emb(tone)
@@ -402,7 +432,7 @@ class TextEncoder(nn.Module):
402
  + bert_emb
403
  + ja_bert_emb
404
  + en_bert_emb
405
- + self.emo_q_proj(emo_emb)
406
  ) * math.sqrt(
407
  self.hidden_channels
408
  ) # [b, t, h]
@@ -415,7 +445,7 @@ class TextEncoder(nn.Module):
415
  stats = self.proj(x) * x_mask
416
 
417
  m, logs = torch.split(stats, self.out_channels, dim=1)
418
- return x, m, logs, x_mask, emo_commit_loss
419
 
420
 
421
  class ResidualCouplingBlock(nn.Module):
@@ -989,6 +1019,7 @@ class SynthesizerTrn(nn.Module):
989
  y_mask,
990
  (z, z_p, m_p, logs_p, m_q, logs_q),
991
  (x, logw, logw_),
 
992
  loss_commit,
993
  )
994
 
 
10
 
11
  from torch.nn import Conv1d, ConvTranspose1d, Conv2d
12
  from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 
13
 
14
  from commons import init_weights, get_padding
15
  from text import symbols, num_tones, num_languages
16
 
17
+ from vector_quantize_pytorch import VectorQuantize
18
+
19
 
20
  class DurationDiscriminator(nn.Module): # vits2
21
  def __init__(
 
312
  return x * x_mask
313
 
314
 
315
+ class Bottleneck(nn.Sequential):
316
+ def __init__(self, in_dim, hidden_dim):
317
+ c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
318
+ c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
319
+ super().__init__(*[c_fc1, c_fc2])
320
+
321
+
322
+ class Block(nn.Module):
323
+ def __init__(self, in_dim, hidden_dim) -> None:
324
+ super().__init__()
325
+ self.norm = nn.LayerNorm(in_dim)
326
+ self.mlp = MLP(in_dim, hidden_dim)
327
+
328
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
329
+ x = x + self.mlp(self.norm(x))
330
+ return x
331
+
332
+
333
+ class MLP(nn.Module):
334
+ def __init__(self, in_dim, hidden_dim):
335
+ super().__init__()
336
+ self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
337
+ self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
338
+ self.c_proj = nn.Linear(hidden_dim, in_dim, bias=False)
339
+
340
+ def forward(self, x: torch.Tensor):
341
+ x = F.silu(self.c_fc1(x)) * self.c_fc2(x)
342
+ x = self.c_proj(x)
343
+ return x
344
+
345
+
346
  class TextEncoder(nn.Module):
347
  def __init__(
348
  self,
 
376
  self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
377
  self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
378
  self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
379
+ # self.emo_proj = nn.Linear(512, hidden_channels)
380
+ self.in_feature_net = nn.Sequential(
381
+ # input is assumed to be an already normalized embedding
382
+ nn.Linear(512, 1028, bias=False),
383
+ nn.GELU(),
384
+ nn.LayerNorm(1028),
385
+ *[Block(1028, 512) for _ in range(1)],
386
+ nn.Linear(1028, 512, bias=False),
387
+ # normalize before passing to VQ?
388
+ # nn.GELU(),
389
+ # nn.LayerNorm(512),
390
+ )
391
+ self.emo_vq = VectorQuantize(
392
+ dim=512,
393
+ codebook_size=64,
394
+ codebook_dim=32,
395
+ commitment_weight=0.1,
396
+ decay=0.85,
397
+ heads=32,
398
+ kmeans_iters=20,
399
+ separate_codebook_per_head=True,
400
+ stochastic_sample_codes=True,
401
+ threshold_ema_dead_code=2,
402
+ )
403
+ self.out_feature_net = nn.Linear(512, hidden_channels)
404
 
405
  self.encoder = attentions.Encoder(
406
  hidden_channels,
 
420
  bert_emb = self.bert_proj(bert).transpose(1, 2)
421
  ja_bert_emb = self.ja_bert_proj(ja_bert).transpose(1, 2)
422
  en_bert_emb = self.en_bert_proj(en_bert).transpose(1, 2)
423
+ emo_emb = self.in_feature_net(emo)
424
+ emo_emb, _, loss_commit = self.emo_vq(emo_emb.unsqueeze(1))
425
+ loss_commit = loss_commit.mean()
426
+ emo_emb = self.out_feature_net(emo_emb)
427
+ # emo_emb = self.emo_proj(emo.unsqueeze(1))
 
 
428
  x = (
429
  self.emb(x)
430
  + self.tone_emb(tone)
 
432
  + bert_emb
433
  + ja_bert_emb
434
  + en_bert_emb
435
+ + emo_emb
436
  ) * math.sqrt(
437
  self.hidden_channels
438
  ) # [b, t, h]
 
445
  stats = self.proj(x) * x_mask
446
 
447
  m, logs = torch.split(stats, self.out_channels, dim=1)
448
+ return x, m, logs, x_mask, loss_commit
449
 
450
 
451
  class ResidualCouplingBlock(nn.Module):
 
1019
  y_mask,
1020
  (z, z_p, m_p, logs_p, m_q, logs_q),
1021
  (x, logw, logw_),
1022
+ g,
1023
  loss_commit,
1024
  )
1025
 
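The reworked TextEncoder above routes the 512-dim CLAP emotion embedding through an MLP bottleneck (`in_feature_net`) and a multi-head vector quantizer before adding it to the phoneme-level features. A standalone sketch of the `vector_quantize_pytorch` call pattern it relies on (toy batch, not the training setup):

```python
import torch
from vector_quantize_pytorch import VectorQuantize

# Mirrors the emo_vq construction in TextEncoder, minus the k-means / dead-code options
vq = VectorQuantize(
    dim=512,
    codebook_size=64,
    codebook_dim=32,
    heads=32,
    separate_codebook_per_head=True,
    commitment_weight=0.1,
    decay=0.85,
)

emo = torch.randn(4, 1, 512)               # (batch, sequence length 1, feature dim)
quantized, indices, commit_loss = vq(emo)   # quantized: (4, 1, 512); commit_loss feeds the training loss (mean-reduced above)
```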
monotonic_align/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.64 kB). View file
 
monotonic_align/__pycache__/core.cpython-311.pyc ADDED
Binary file (2 kB). View file
 
onnx_modules/V200/__init__.py ADDED
File without changes
onnx_modules/V200/attentions_onnx.py ADDED
@@ -0,0 +1,378 @@
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ import commons
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ class LayerNorm(nn.Module):
13
+ def __init__(self, channels, eps=1e-5):
14
+ super().__init__()
15
+ self.channels = channels
16
+ self.eps = eps
17
+
18
+ self.gamma = nn.Parameter(torch.ones(channels))
19
+ self.beta = nn.Parameter(torch.zeros(channels))
20
+
21
+ def forward(self, x):
22
+ x = x.transpose(1, -1)
23
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
24
+ return x.transpose(1, -1)
25
+
26
+
27
+ @torch.jit.script
28
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
29
+ n_channels_int = n_channels[0]
30
+ in_act = input_a + input_b
31
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
32
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
33
+ acts = t_act * s_act
34
+ return acts
35
+
36
+
37
+ class Encoder(nn.Module):
38
+ def __init__(
39
+ self,
40
+ hidden_channels,
41
+ filter_channels,
42
+ n_heads,
43
+ n_layers,
44
+ kernel_size=1,
45
+ p_dropout=0.0,
46
+ window_size=4,
47
+ isflow=True,
48
+ **kwargs
49
+ ):
50
+ super().__init__()
51
+ self.hidden_channels = hidden_channels
52
+ self.filter_channels = filter_channels
53
+ self.n_heads = n_heads
54
+ self.n_layers = n_layers
55
+ self.kernel_size = kernel_size
56
+ self.p_dropout = p_dropout
57
+ self.window_size = window_size
58
+ # if isflow:
59
+ # cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
60
+ # self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
61
+ # self.cond_layer = weight_norm(cond_layer, name='weight')
62
+ # self.gin_channels = 256
63
+ self.cond_layer_idx = self.n_layers
64
+ if "gin_channels" in kwargs:
65
+ self.gin_channels = kwargs["gin_channels"]
66
+ if self.gin_channels != 0:
67
+ self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
68
+ # vits2 says 3rd block, so idx is 2 by default
69
+ self.cond_layer_idx = (
70
+ kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
71
+ )
72
+ logging.debug(self.gin_channels, self.cond_layer_idx)
73
+ assert (
74
+ self.cond_layer_idx < self.n_layers
75
+ ), "cond_layer_idx should be less than n_layers"
76
+ self.drop = nn.Dropout(p_dropout)
77
+ self.attn_layers = nn.ModuleList()
78
+ self.norm_layers_1 = nn.ModuleList()
79
+ self.ffn_layers = nn.ModuleList()
80
+ self.norm_layers_2 = nn.ModuleList()
81
+ for i in range(self.n_layers):
82
+ self.attn_layers.append(
83
+ MultiHeadAttention(
84
+ hidden_channels,
85
+ hidden_channels,
86
+ n_heads,
87
+ p_dropout=p_dropout,
88
+ window_size=window_size,
89
+ )
90
+ )
91
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
92
+ self.ffn_layers.append(
93
+ FFN(
94
+ hidden_channels,
95
+ hidden_channels,
96
+ filter_channels,
97
+ kernel_size,
98
+ p_dropout=p_dropout,
99
+ )
100
+ )
101
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
102
+
103
+ def forward(self, x, x_mask, g=None):
104
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
105
+ x = x * x_mask
106
+ for i in range(self.n_layers):
107
+ if i == self.cond_layer_idx and g is not None:
108
+ g = self.spk_emb_linear(g.transpose(1, 2))
109
+ g = g.transpose(1, 2)
110
+ x = x + g
111
+ x = x * x_mask
112
+ y = self.attn_layers[i](x, x, attn_mask)
113
+ y = self.drop(y)
114
+ x = self.norm_layers_1[i](x + y)
115
+
116
+ y = self.ffn_layers[i](x, x_mask)
117
+ y = self.drop(y)
118
+ x = self.norm_layers_2[i](x + y)
119
+ x = x * x_mask
120
+ return x
121
+
122
+
123
+ class MultiHeadAttention(nn.Module):
124
+ def __init__(
125
+ self,
126
+ channels,
127
+ out_channels,
128
+ n_heads,
129
+ p_dropout=0.0,
130
+ window_size=None,
131
+ heads_share=True,
132
+ block_length=None,
133
+ proximal_bias=False,
134
+ proximal_init=False,
135
+ ):
136
+ super().__init__()
137
+ assert channels % n_heads == 0
138
+
139
+ self.channels = channels
140
+ self.out_channels = out_channels
141
+ self.n_heads = n_heads
142
+ self.p_dropout = p_dropout
143
+ self.window_size = window_size
144
+ self.heads_share = heads_share
145
+ self.block_length = block_length
146
+ self.proximal_bias = proximal_bias
147
+ self.proximal_init = proximal_init
148
+ self.attn = None
149
+
150
+ self.k_channels = channels // n_heads
151
+ self.conv_q = nn.Conv1d(channels, channels, 1)
152
+ self.conv_k = nn.Conv1d(channels, channels, 1)
153
+ self.conv_v = nn.Conv1d(channels, channels, 1)
154
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
155
+ self.drop = nn.Dropout(p_dropout)
156
+
157
+ if window_size is not None:
158
+ n_heads_rel = 1 if heads_share else n_heads
159
+ rel_stddev = self.k_channels**-0.5
160
+ self.emb_rel_k = nn.Parameter(
161
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
162
+ * rel_stddev
163
+ )
164
+ self.emb_rel_v = nn.Parameter(
165
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
166
+ * rel_stddev
167
+ )
168
+
169
+ nn.init.xavier_uniform_(self.conv_q.weight)
170
+ nn.init.xavier_uniform_(self.conv_k.weight)
171
+ nn.init.xavier_uniform_(self.conv_v.weight)
172
+ if proximal_init:
173
+ with torch.no_grad():
174
+ self.conv_k.weight.copy_(self.conv_q.weight)
175
+ self.conv_k.bias.copy_(self.conv_q.bias)
176
+
177
+ def forward(self, x, c, attn_mask=None):
178
+ q = self.conv_q(x)
179
+ k = self.conv_k(c)
180
+ v = self.conv_v(c)
181
+
182
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
183
+
184
+ x = self.conv_o(x)
185
+ return x
186
+
187
+ def attention(self, query, key, value, mask=None):
188
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
189
+ b, d, t_s, t_t = (*key.size(), query.size(2))
190
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
191
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
192
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
193
+
194
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
195
+ if self.window_size is not None:
196
+ assert (
197
+ t_s == t_t
198
+ ), "Relative attention is only available for self-attention."
199
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
200
+ rel_logits = self._matmul_with_relative_keys(
201
+ query / math.sqrt(self.k_channels), key_relative_embeddings
202
+ )
203
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
204
+ scores = scores + scores_local
205
+ if self.proximal_bias:
206
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
207
+ scores = scores + self._attention_bias_proximal(t_s).to(
208
+ device=scores.device, dtype=scores.dtype
209
+ )
210
+ if mask is not None:
211
+ scores = scores.masked_fill(mask == 0, -1e4)
212
+ if self.block_length is not None:
213
+ assert (
214
+ t_s == t_t
215
+ ), "Local attention is only available for self-attention."
216
+ block_mask = (
217
+ torch.ones_like(scores)
218
+ .triu(-self.block_length)
219
+ .tril(self.block_length)
220
+ )
221
+ scores = scores.masked_fill(block_mask == 0, -1e4)
222
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
223
+ p_attn = self.drop(p_attn)
224
+ output = torch.matmul(p_attn, value)
225
+ if self.window_size is not None:
226
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
227
+ value_relative_embeddings = self._get_relative_embeddings(
228
+ self.emb_rel_v, t_s
229
+ )
230
+ output = output + self._matmul_with_relative_values(
231
+ relative_weights, value_relative_embeddings
232
+ )
233
+ output = (
234
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
235
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
236
+ return output, p_attn
237
+
238
+ def _matmul_with_relative_values(self, x, y):
239
+ """
240
+ x: [b, h, l, m]
241
+ y: [h or 1, m, d]
242
+ ret: [b, h, l, d]
243
+ """
244
+ ret = torch.matmul(x, y.unsqueeze(0))
245
+ return ret
246
+
247
+ def _matmul_with_relative_keys(self, x, y):
248
+ """
249
+ x: [b, h, l, d]
250
+ y: [h or 1, m, d]
251
+ ret: [b, h, l, m]
252
+ """
253
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
254
+ return ret
255
+
256
+ def _get_relative_embeddings(self, relative_embeddings, length):
257
+ max_relative_position = 2 * self.window_size + 1
258
+ # Pad first before slice to avoid using cond ops.
259
+ pad_length = max(length - (self.window_size + 1), 0)
260
+ slice_start_position = max((self.window_size + 1) - length, 0)
261
+ slice_end_position = slice_start_position + 2 * length - 1
262
+ if pad_length > 0:
263
+ padded_relative_embeddings = F.pad(
264
+ relative_embeddings,
265
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
266
+ )
267
+ else:
268
+ padded_relative_embeddings = relative_embeddings
269
+ used_relative_embeddings = padded_relative_embeddings[
270
+ :, slice_start_position:slice_end_position
271
+ ]
272
+ return used_relative_embeddings
273
+
274
+ def _relative_position_to_absolute_position(self, x):
275
+ """
276
+ x: [b, h, l, 2*l-1]
277
+ ret: [b, h, l, l]
278
+ """
279
+ batch, heads, length, _ = x.size()
280
+ # Concat columns of pad to shift from relative to absolute indexing.
281
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
282
+
283
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
284
+ x_flat = x.view([batch, heads, length * 2 * length])
285
+ x_flat = F.pad(
286
+ x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
287
+ )
288
+
289
+ # Reshape and slice out the padded elements.
290
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
291
+ :, :, :length, length - 1 :
292
+ ]
293
+ return x_final
294
+
295
+ def _absolute_position_to_relative_position(self, x):
296
+ """
297
+ x: [b, h, l, l]
298
+ ret: [b, h, l, 2*l-1]
299
+ """
300
+ batch, heads, length, _ = x.size()
301
+ # padd along column
302
+ x = F.pad(
303
+ x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
304
+ )
305
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
306
+ # add 0's in the beginning that will skew the elements after reshape
307
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
308
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
309
+ return x_final
310
+
311
+ def _attention_bias_proximal(self, length):
312
+ """Bias for self-attention to encourage attention to close positions.
313
+ Args:
314
+ length: an integer scalar.
315
+ Returns:
316
+ a Tensor with shape [1, 1, length, length]
317
+ """
318
+ r = torch.arange(length, dtype=torch.float32)
319
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
320
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
321
+
322
+
323
+ class FFN(nn.Module):
324
+ def __init__(
325
+ self,
326
+ in_channels,
327
+ out_channels,
328
+ filter_channels,
329
+ kernel_size,
330
+ p_dropout=0.0,
331
+ activation=None,
332
+ causal=False,
333
+ ):
334
+ super().__init__()
335
+ self.in_channels = in_channels
336
+ self.out_channels = out_channels
337
+ self.filter_channels = filter_channels
338
+ self.kernel_size = kernel_size
339
+ self.p_dropout = p_dropout
340
+ self.activation = activation
341
+ self.causal = causal
342
+
343
+ if causal:
344
+ self.padding = self._causal_padding
345
+ else:
346
+ self.padding = self._same_padding
347
+
348
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
349
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
350
+ self.drop = nn.Dropout(p_dropout)
351
+
352
+ def forward(self, x, x_mask):
353
+ x = self.conv_1(self.padding(x * x_mask))
354
+ if self.activation == "gelu":
355
+ x = x * torch.sigmoid(1.702 * x)
356
+ else:
357
+ x = torch.relu(x)
358
+ x = self.drop(x)
359
+ x = self.conv_2(self.padding(x * x_mask))
360
+ return x * x_mask
361
+
362
+ def _causal_padding(self, x):
363
+ if self.kernel_size == 1:
364
+ return x
365
+ pad_l = self.kernel_size - 1
366
+ pad_r = 0
367
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
368
+ x = F.pad(x, commons.convert_pad_shape(padding))
369
+ return x
370
+
371
+ def _same_padding(self, x):
372
+ if self.kernel_size == 1:
373
+ return x
374
+ pad_l = (self.kernel_size - 1) // 2
375
+ pad_r = self.kernel_size // 2
376
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
377
+ x = F.pad(x, commons.convert_pad_shape(padding))
378
+ return x
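A shape-level smoke test of the ONNX-export encoder above (toy sizes; assumes the repo root is on the import path so `commons` resolves):

```python
import torch
from onnx_modules.V200.attentions_onnx import Encoder

enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2, n_layers=4, gin_channels=256)
x = torch.randn(1, 192, 50)      # (batch, hidden, time) phoneme-level features
x_mask = torch.ones(1, 1, 50)    # all positions valid
g = torch.randn(1, 256, 1)       # speaker embedding injected at cond_layer_idx
y = enc(x, x_mask, g=g)          # -> (1, 192, 50)
```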
onnx_modules/V200/models_onnx.py ADDED
@@ -0,0 +1,990 @@
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ import commons
7
+ import modules
8
+ from . import attentions_onnx
9
+
10
+ from torch.nn import Conv1d, ConvTranspose1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from commons import init_weights, get_padding
13
+ from .text import symbols, num_tones, num_languages
14
+
15
+
16
+ class DurationDiscriminator(nn.Module): # vits2
17
+ def __init__(
18
+ self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
19
+ ):
20
+ super().__init__()
21
+
22
+ self.in_channels = in_channels
23
+ self.filter_channels = filter_channels
24
+ self.kernel_size = kernel_size
25
+ self.p_dropout = p_dropout
26
+ self.gin_channels = gin_channels
27
+
28
+ self.drop = nn.Dropout(p_dropout)
29
+ self.conv_1 = nn.Conv1d(
30
+ in_channels, filter_channels, kernel_size, padding=kernel_size // 2
31
+ )
32
+ self.norm_1 = modules.LayerNorm(filter_channels)
33
+ self.conv_2 = nn.Conv1d(
34
+ filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
35
+ )
36
+ self.norm_2 = modules.LayerNorm(filter_channels)
37
+ self.dur_proj = nn.Conv1d(1, filter_channels, 1)
38
+
39
+ self.pre_out_conv_1 = nn.Conv1d(
40
+ 2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
41
+ )
42
+ self.pre_out_norm_1 = modules.LayerNorm(filter_channels)
43
+ self.pre_out_conv_2 = nn.Conv1d(
44
+ filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
45
+ )
46
+ self.pre_out_norm_2 = modules.LayerNorm(filter_channels)
47
+
48
+ if gin_channels != 0:
49
+ self.cond = nn.Conv1d(gin_channels, in_channels, 1)
50
+
51
+ self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid())
52
+
53
+ def forward_probability(self, x, x_mask, dur, g=None):
54
+ dur = self.dur_proj(dur)
55
+ x = torch.cat([x, dur], dim=1)
56
+ x = self.pre_out_conv_1(x * x_mask)
57
+ x = torch.relu(x)
58
+ x = self.pre_out_norm_1(x)
59
+ x = self.drop(x)
60
+ x = self.pre_out_conv_2(x * x_mask)
61
+ x = torch.relu(x)
62
+ x = self.pre_out_norm_2(x)
63
+ x = self.drop(x)
64
+ x = x * x_mask
65
+ x = x.transpose(1, 2)
66
+ output_prob = self.output_layer(x)
67
+ return output_prob
68
+
69
+ def forward(self, x, x_mask, dur_r, dur_hat, g=None):
70
+ x = torch.detach(x)
71
+ if g is not None:
72
+ g = torch.detach(g)
73
+ x = x + self.cond(g)
74
+ x = self.conv_1(x * x_mask)
75
+ x = torch.relu(x)
76
+ x = self.norm_1(x)
77
+ x = self.drop(x)
78
+ x = self.conv_2(x * x_mask)
79
+ x = torch.relu(x)
80
+ x = self.norm_2(x)
81
+ x = self.drop(x)
82
+
83
+ output_probs = []
84
+ for dur in [dur_r, dur_hat]:
85
+ output_prob = self.forward_probability(x, x_mask, dur, g)
86
+ output_probs.append(output_prob)
87
+
88
+ return output_probs
89
+
90
+
91
+ class TransformerCouplingBlock(nn.Module):
92
+ def __init__(
93
+ self,
94
+ channels,
95
+ hidden_channels,
96
+ filter_channels,
97
+ n_heads,
98
+ n_layers,
99
+ kernel_size,
100
+ p_dropout,
101
+ n_flows=4,
102
+ gin_channels=0,
103
+ share_parameter=False,
104
+ ):
105
+ super().__init__()
106
+ self.channels = channels
107
+ self.hidden_channels = hidden_channels
108
+ self.kernel_size = kernel_size
109
+ self.n_layers = n_layers
110
+ self.n_flows = n_flows
111
+ self.gin_channels = gin_channels
112
+
113
+ self.flows = nn.ModuleList()
114
+
115
+ self.wn = (
116
+ attentions_onnx.FFT(
117
+ hidden_channels,
118
+ filter_channels,
119
+ n_heads,
120
+ n_layers,
121
+ kernel_size,
122
+ p_dropout,
123
+ isflow=True,
124
+ gin_channels=self.gin_channels,
125
+ )
126
+ if share_parameter
127
+ else None
128
+ )
129
+
130
+ for i in range(n_flows):
131
+ self.flows.append(
132
+ modules.TransformerCouplingLayer(
133
+ channels,
134
+ hidden_channels,
135
+ kernel_size,
136
+ n_layers,
137
+ n_heads,
138
+ p_dropout,
139
+ filter_channels,
140
+ mean_only=True,
141
+ wn_sharing_parameter=self.wn,
142
+ gin_channels=self.gin_channels,
143
+ )
144
+ )
145
+ self.flows.append(modules.Flip())
146
+
147
+ def forward(self, x, x_mask, g=None, reverse=True):
148
+ if not reverse:
149
+ for flow in self.flows:
150
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
151
+ else:
152
+ for flow in reversed(self.flows):
153
+ x = flow(x, x_mask, g=g, reverse=reverse)
154
+ return x
155
+
156
+
157
+ class StochasticDurationPredictor(nn.Module):
158
+ def __init__(
159
+ self,
160
+ in_channels,
161
+ filter_channels,
162
+ kernel_size,
163
+ p_dropout,
164
+ n_flows=4,
165
+ gin_channels=0,
166
+ ):
167
+ super().__init__()
168
+ filter_channels = in_channels # this needs to be removed in a future version.
169
+ self.in_channels = in_channels
170
+ self.filter_channels = filter_channels
171
+ self.kernel_size = kernel_size
172
+ self.p_dropout = p_dropout
173
+ self.n_flows = n_flows
174
+ self.gin_channels = gin_channels
175
+
176
+ self.log_flow = modules.Log()
177
+ self.flows = nn.ModuleList()
178
+ self.flows.append(modules.ElementwiseAffine(2))
179
+ for i in range(n_flows):
180
+ self.flows.append(
181
+ modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
182
+ )
183
+ self.flows.append(modules.Flip())
184
+
185
+ self.post_pre = nn.Conv1d(1, filter_channels, 1)
186
+ self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
187
+ self.post_convs = modules.DDSConv(
188
+ filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
189
+ )
190
+ self.post_flows = nn.ModuleList()
191
+ self.post_flows.append(modules.ElementwiseAffine(2))
192
+ for i in range(4):
193
+ self.post_flows.append(
194
+ modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
195
+ )
196
+ self.post_flows.append(modules.Flip())
197
+
198
+ self.pre = nn.Conv1d(in_channels, filter_channels, 1)
199
+ self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
200
+ self.convs = modules.DDSConv(
201
+ filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
202
+ )
203
+ if gin_channels != 0:
204
+ self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
205
+
206
+ def forward(self, x, x_mask, z, g=None):
207
+ x = torch.detach(x)
208
+ x = self.pre(x)
209
+ if g is not None:
210
+ g = torch.detach(g)
211
+ x = x + self.cond(g)
212
+ x = self.convs(x, x_mask)
213
+ x = self.proj(x) * x_mask
214
+
215
+ flows = list(reversed(self.flows))
216
+ flows = flows[:-2] + [flows[-1]] # remove a useless vflow
217
+ for flow in flows:
218
+ z = flow(z, x_mask, g=x, reverse=True)
219
+ z0, z1 = torch.split(z, [1, 1], 1)
220
+ logw = z0
221
+ return logw
222
+
223
+
224
+ class DurationPredictor(nn.Module):
225
+ def __init__(
226
+ self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
227
+ ):
228
+ super().__init__()
229
+
230
+ self.in_channels = in_channels
231
+ self.filter_channels = filter_channels
232
+ self.kernel_size = kernel_size
233
+ self.p_dropout = p_dropout
234
+ self.gin_channels = gin_channels
235
+
236
+ self.drop = nn.Dropout(p_dropout)
237
+ self.conv_1 = nn.Conv1d(
238
+ in_channels, filter_channels, kernel_size, padding=kernel_size // 2
239
+ )
240
+ self.norm_1 = modules.LayerNorm(filter_channels)
241
+ self.conv_2 = nn.Conv1d(
242
+ filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
243
+ )
244
+ self.norm_2 = modules.LayerNorm(filter_channels)
245
+ self.proj = nn.Conv1d(filter_channels, 1, 1)
246
+
247
+ if gin_channels != 0:
248
+ self.cond = nn.Conv1d(gin_channels, in_channels, 1)
249
+
250
+ def forward(self, x, x_mask, g=None):
251
+ x = torch.detach(x)
252
+ if g is not None:
253
+ g = torch.detach(g)
254
+ x = x + self.cond(g)
255
+ x = self.conv_1(x * x_mask)
256
+ x = torch.relu(x)
257
+ x = self.norm_1(x)
258
+ x = self.drop(x)
259
+ x = self.conv_2(x * x_mask)
260
+ x = torch.relu(x)
261
+ x = self.norm_2(x)
262
+ x = self.drop(x)
263
+ x = self.proj(x * x_mask)
264
+ return x * x_mask
265
+
266
+
267
+ class TextEncoder(nn.Module):
268
+ def __init__(
269
+ self,
270
+ n_vocab,
271
+ out_channels,
272
+ hidden_channels,
273
+ filter_channels,
274
+ n_heads,
275
+ n_layers,
276
+ kernel_size,
277
+ p_dropout,
278
+ gin_channels=0,
279
+ ):
280
+ super().__init__()
281
+ self.n_vocab = n_vocab
282
+ self.out_channels = out_channels
283
+ self.hidden_channels = hidden_channels
284
+ self.filter_channels = filter_channels
285
+ self.n_heads = n_heads
286
+ self.n_layers = n_layers
287
+ self.kernel_size = kernel_size
288
+ self.p_dropout = p_dropout
289
+ self.gin_channels = gin_channels
290
+ self.emb = nn.Embedding(len(symbols), hidden_channels)
291
+ nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
292
+ self.tone_emb = nn.Embedding(num_tones, hidden_channels)
293
+ nn.init.normal_(self.tone_emb.weight, 0.0, hidden_channels**-0.5)
294
+ self.language_emb = nn.Embedding(num_languages, hidden_channels)
295
+ nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels**-0.5)
296
+ self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
297
+ self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
298
+ self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
299
+
300
+ self.encoder = attentions_onnx.Encoder(
301
+ hidden_channels,
302
+ filter_channels,
303
+ n_heads,
304
+ n_layers,
305
+ kernel_size,
306
+ p_dropout,
307
+ gin_channels=self.gin_channels,
308
+ )
309
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
310
+
311
+ def forward(self, x, x_lengths, tone, language, bert, ja_bert, en_bert, g=None):
312
+ x_mask = torch.ones_like(x).unsqueeze(0)
313
+ bert_emb = self.bert_proj(bert.transpose(0, 1).unsqueeze(0)).transpose(1, 2)
314
+ ja_bert_emb = self.ja_bert_proj(ja_bert.transpose(0, 1).unsqueeze(0)).transpose(
315
+ 1, 2
316
+ )
317
+ en_bert_emb = self.en_bert_proj(en_bert.transpose(0, 1).unsqueeze(0)).transpose(
318
+ 1, 2
319
+ )
320
+ x = (
321
+ self.emb(x)
322
+ + self.tone_emb(tone)
323
+ + self.language_emb(language)
324
+ + bert_emb
325
+ + ja_bert_emb
326
+ + en_bert_emb
327
+ ) * math.sqrt(
328
+ self.hidden_channels
329
+ ) # [b, t, h]
330
+ x = torch.transpose(x, 1, -1) # [b, h, t]
331
+ x_mask = x_mask.to(x.dtype)
332
+
333
+ x = self.encoder(x * x_mask, x_mask, g=g)
334
+ stats = self.proj(x) * x_mask
335
+
336
+ m, logs = torch.split(stats, self.out_channels, dim=1)
337
+ return x, m, logs, x_mask
338
+
339
+
340
+ class ResidualCouplingBlock(nn.Module):
341
+ def __init__(
342
+ self,
343
+ channels,
344
+ hidden_channels,
345
+ kernel_size,
346
+ dilation_rate,
347
+ n_layers,
348
+ n_flows=4,
349
+ gin_channels=0,
350
+ ):
351
+ super().__init__()
352
+ self.channels = channels
353
+ self.hidden_channels = hidden_channels
354
+ self.kernel_size = kernel_size
355
+ self.dilation_rate = dilation_rate
356
+ self.n_layers = n_layers
357
+ self.n_flows = n_flows
358
+ self.gin_channels = gin_channels
359
+
360
+ self.flows = nn.ModuleList()
361
+ for i in range(n_flows):
362
+ self.flows.append(
363
+ modules.ResidualCouplingLayer(
364
+ channels,
365
+ hidden_channels,
366
+ kernel_size,
367
+ dilation_rate,
368
+ n_layers,
369
+ gin_channels=gin_channels,
370
+ mean_only=True,
371
+ )
372
+ )
373
+ self.flows.append(modules.Flip())
374
+
375
+ def forward(self, x, x_mask, g=None, reverse=True):
376
+ if not reverse:
377
+ for flow in self.flows:
378
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
379
+ else:
380
+ for flow in reversed(self.flows):
381
+ x = flow(x, x_mask, g=g, reverse=reverse)
382
+ return x
383
+
384
+
385
+ class PosteriorEncoder(nn.Module):
386
+ def __init__(
387
+ self,
388
+ in_channels,
389
+ out_channels,
390
+ hidden_channels,
391
+ kernel_size,
392
+ dilation_rate,
393
+ n_layers,
394
+ gin_channels=0,
395
+ ):
396
+ super().__init__()
397
+ self.in_channels = in_channels
398
+ self.out_channels = out_channels
399
+ self.hidden_channels = hidden_channels
400
+ self.kernel_size = kernel_size
401
+ self.dilation_rate = dilation_rate
402
+ self.n_layers = n_layers
403
+ self.gin_channels = gin_channels
404
+
405
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
406
+ self.enc = modules.WN(
407
+ hidden_channels,
408
+ kernel_size,
409
+ dilation_rate,
410
+ n_layers,
411
+ gin_channels=gin_channels,
412
+ )
413
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
414
+
415
+ def forward(self, x, x_lengths, g=None):
416
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
417
+ x.dtype
418
+ )
419
+ x = self.pre(x) * x_mask
420
+ x = self.enc(x, x_mask, g=g)
421
+ stats = self.proj(x) * x_mask
422
+ m, logs = torch.split(stats, self.out_channels, dim=1)
423
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
424
+ return z, m, logs, x_mask
425
+
426
+
427
+ class Generator(torch.nn.Module):
428
+ def __init__(
429
+ self,
430
+ initial_channel,
431
+ resblock,
432
+ resblock_kernel_sizes,
433
+ resblock_dilation_sizes,
434
+ upsample_rates,
435
+ upsample_initial_channel,
436
+ upsample_kernel_sizes,
437
+ gin_channels=0,
438
+ ):
439
+ super(Generator, self).__init__()
440
+ self.num_kernels = len(resblock_kernel_sizes)
441
+ self.num_upsamples = len(upsample_rates)
442
+ self.conv_pre = Conv1d(
443
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
444
+ )
445
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
446
+
447
+ self.ups = nn.ModuleList()
448
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
449
+ self.ups.append(
450
+ weight_norm(
451
+ ConvTranspose1d(
452
+ upsample_initial_channel // (2**i),
453
+ upsample_initial_channel // (2 ** (i + 1)),
454
+ k,
455
+ u,
456
+ padding=(k - u) // 2,
457
+ )
458
+ )
459
+ )
460
+
461
+ self.resblocks = nn.ModuleList()
462
+ for i in range(len(self.ups)):
463
+ ch = upsample_initial_channel // (2 ** (i + 1))
464
+ for j, (k, d) in enumerate(
465
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
466
+ ):
467
+ self.resblocks.append(resblock(ch, k, d))
468
+
469
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
470
+ self.ups.apply(init_weights)
471
+
472
+ if gin_channels != 0:
473
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
474
+
475
+ def forward(self, x, g=None):
476
+ x = self.conv_pre(x)
477
+ if g is not None:
478
+ x = x + self.cond(g)
479
+
480
+ for i in range(self.num_upsamples):
481
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
482
+ x = self.ups[i](x)
483
+ xs = None
484
+ for j in range(self.num_kernels):
485
+ if xs is None:
486
+ xs = self.resblocks[i * self.num_kernels + j](x)
487
+ else:
488
+ xs += self.resblocks[i * self.num_kernels + j](x)
489
+ x = xs / self.num_kernels
490
+ x = F.leaky_relu(x)
491
+ x = self.conv_post(x)
492
+ x = torch.tanh(x)
493
+
494
+ return x
495
+
496
+ def remove_weight_norm(self):
497
+ print("Removing weight norm...")
498
+ for layer in self.ups:
499
+ remove_weight_norm(layer)
500
+ for layer in self.resblocks:
501
+ layer.remove_weight_norm()
502
+
503
+
504
+ class DiscriminatorP(torch.nn.Module):
505
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
506
+ super(DiscriminatorP, self).__init__()
507
+ self.period = period
508
+ self.use_spectral_norm = use_spectral_norm
509
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
510
+ self.convs = nn.ModuleList(
511
+ [
512
+ norm_f(
513
+ Conv2d(
514
+ 1,
515
+ 32,
516
+ (kernel_size, 1),
517
+ (stride, 1),
518
+ padding=(get_padding(kernel_size, 1), 0),
519
+ )
520
+ ),
521
+ norm_f(
522
+ Conv2d(
523
+ 32,
524
+ 128,
525
+ (kernel_size, 1),
526
+ (stride, 1),
527
+ padding=(get_padding(kernel_size, 1), 0),
528
+ )
529
+ ),
530
+ norm_f(
531
+ Conv2d(
532
+ 128,
533
+ 512,
534
+ (kernel_size, 1),
535
+ (stride, 1),
536
+ padding=(get_padding(kernel_size, 1), 0),
537
+ )
538
+ ),
539
+ norm_f(
540
+ Conv2d(
541
+ 512,
542
+ 1024,
543
+ (kernel_size, 1),
544
+ (stride, 1),
545
+ padding=(get_padding(kernel_size, 1), 0),
546
+ )
547
+ ),
548
+ norm_f(
549
+ Conv2d(
550
+ 1024,
551
+ 1024,
552
+ (kernel_size, 1),
553
+ 1,
554
+ padding=(get_padding(kernel_size, 1), 0),
555
+ )
556
+ ),
557
+ ]
558
+ )
559
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
560
+
561
+ def forward(self, x):
562
+ fmap = []
563
+
564
+ # 1d to 2d
565
+ b, c, t = x.shape
566
+ if t % self.period != 0: # pad first
567
+ n_pad = self.period - (t % self.period)
568
+ x = F.pad(x, (0, n_pad), "reflect")
569
+ t = t + n_pad
570
+ x = x.view(b, c, t // self.period, self.period)
571
+
572
+ for layer in self.convs:
573
+ x = layer(x)
574
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
575
+ fmap.append(x)
576
+ x = self.conv_post(x)
577
+ fmap.append(x)
578
+ x = torch.flatten(x, 1, -1)
579
+
580
+ return x, fmap
581
+
582
+
583
+ class DiscriminatorS(torch.nn.Module):
584
+ def __init__(self, use_spectral_norm=False):
585
+ super(DiscriminatorS, self).__init__()
586
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
587
+ self.convs = nn.ModuleList(
588
+ [
589
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
590
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
591
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
592
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
593
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
594
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
595
+ ]
596
+ )
597
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
598
+
599
+ def forward(self, x):
600
+ fmap = []
601
+
602
+ for layer in self.convs:
603
+ x = layer(x)
604
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
605
+ fmap.append(x)
606
+ x = self.conv_post(x)
607
+ fmap.append(x)
608
+ x = torch.flatten(x, 1, -1)
609
+
610
+ return x, fmap
611
+
612
+
613
+ class MultiPeriodDiscriminator(torch.nn.Module):
614
+ def __init__(self, use_spectral_norm=False):
615
+ super(MultiPeriodDiscriminator, self).__init__()
616
+ periods = [2, 3, 5, 7, 11]
617
+
618
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
619
+ discs = discs + [
620
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
621
+ ]
622
+ self.discriminators = nn.ModuleList(discs)
623
+
624
+ def forward(self, y, y_hat):
625
+ y_d_rs = []
626
+ y_d_gs = []
627
+ fmap_rs = []
628
+ fmap_gs = []
629
+ for i, d in enumerate(self.discriminators):
630
+ y_d_r, fmap_r = d(y)
631
+ y_d_g, fmap_g = d(y_hat)
632
+ y_d_rs.append(y_d_r)
633
+ y_d_gs.append(y_d_g)
634
+ fmap_rs.append(fmap_r)
635
+ fmap_gs.append(fmap_g)
636
+
637
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
638
+
639
+
640
+ class ReferenceEncoder(nn.Module):
641
+ """
642
+ inputs --- [N, Ty/r, n_mels*r] mels
643
+ outputs --- [N, ref_enc_gru_size]
644
+ """
645
+
646
+ def __init__(self, spec_channels, gin_channels=0):
647
+ super().__init__()
648
+ self.spec_channels = spec_channels
649
+ ref_enc_filters = [32, 32, 64, 64, 128, 128]
650
+ K = len(ref_enc_filters)
651
+ filters = [1] + ref_enc_filters
652
+ convs = [
653
+ weight_norm(
654
+ nn.Conv2d(
655
+ in_channels=filters[i],
656
+ out_channels=filters[i + 1],
657
+ kernel_size=(3, 3),
658
+ stride=(2, 2),
659
+ padding=(1, 1),
660
+ )
661
+ )
662
+ for i in range(K)
663
+ ]
664
+ self.convs = nn.ModuleList(convs)
665
+ # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)]) # noqa: E501
666
+
667
+ out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
668
+ self.gru = nn.GRU(
669
+ input_size=ref_enc_filters[-1] * out_channels,
670
+ hidden_size=256 // 2,
671
+ batch_first=True,
672
+ )
673
+ self.proj = nn.Linear(128, gin_channels)
674
+
675
+ def forward(self, inputs, mask=None):
676
+ N = inputs.size(0)
677
+ out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
678
+ for conv in self.convs:
679
+ out = conv(out)
680
+ # out = wn(out)
681
+ out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
682
+
683
+ out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
684
+ T = out.size(1)
685
+ N = out.size(0)
686
+ out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
687
+
688
+ self.gru.flatten_parameters()
689
+ memory, out = self.gru(out) # out --- [1, N, 128]
690
+
691
+ return self.proj(out.squeeze(0))
692
+
693
+ def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
694
+ for i in range(n_convs):
695
+ L = (L - kernel_size + 2 * pad) // stride + 1
696
+ return L
697
+
698
+
699
+ class SynthesizerTrn(nn.Module):
700
+ """
701
+ Synthesizer for Training
702
+ """
703
+
704
+ def __init__(
705
+ self,
706
+ n_vocab,
707
+ spec_channels,
708
+ segment_size,
709
+ inter_channels,
710
+ hidden_channels,
711
+ filter_channels,
712
+ n_heads,
713
+ n_layers,
714
+ kernel_size,
715
+ p_dropout,
716
+ resblock,
717
+ resblock_kernel_sizes,
718
+ resblock_dilation_sizes,
719
+ upsample_rates,
720
+ upsample_initial_channel,
721
+ upsample_kernel_sizes,
722
+ n_speakers=256,
723
+ gin_channels=256,
724
+ use_sdp=True,
725
+ n_flow_layer=4,
726
+ n_layers_trans_flow=4,
727
+ flow_share_parameter=False,
728
+ use_transformer_flow=True,
729
+ **kwargs,
730
+ ):
731
+ super().__init__()
732
+ self.n_vocab = n_vocab
733
+ self.spec_channels = spec_channels
734
+ self.inter_channels = inter_channels
735
+ self.hidden_channels = hidden_channels
736
+ self.filter_channels = filter_channels
737
+ self.n_heads = n_heads
738
+ self.n_layers = n_layers
739
+ self.kernel_size = kernel_size
740
+ self.p_dropout = p_dropout
741
+ self.resblock = resblock
742
+ self.resblock_kernel_sizes = resblock_kernel_sizes
743
+ self.resblock_dilation_sizes = resblock_dilation_sizes
744
+ self.upsample_rates = upsample_rates
745
+ self.upsample_initial_channel = upsample_initial_channel
746
+ self.upsample_kernel_sizes = upsample_kernel_sizes
747
+ self.segment_size = segment_size
748
+ self.n_speakers = n_speakers
749
+ self.gin_channels = gin_channels
750
+ self.n_layers_trans_flow = n_layers_trans_flow
751
+ self.use_spk_conditioned_encoder = kwargs.get(
752
+ "use_spk_conditioned_encoder", True
753
+ )
754
+ self.use_sdp = use_sdp
755
+ self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False)
756
+ self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01)
757
+ self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6)
758
+ self.current_mas_noise_scale = self.mas_noise_scale_initial
759
+ if self.use_spk_conditioned_encoder and gin_channels > 0:
760
+ self.enc_gin_channels = gin_channels
761
+ self.enc_p = TextEncoder(
762
+ n_vocab,
763
+ inter_channels,
764
+ hidden_channels,
765
+ filter_channels,
766
+ n_heads,
767
+ n_layers,
768
+ kernel_size,
769
+ p_dropout,
770
+ gin_channels=self.enc_gin_channels,
771
+ )
772
+ self.dec = Generator(
773
+ inter_channels,
774
+ resblock,
775
+ resblock_kernel_sizes,
776
+ resblock_dilation_sizes,
777
+ upsample_rates,
778
+ upsample_initial_channel,
779
+ upsample_kernel_sizes,
780
+ gin_channels=gin_channels,
781
+ )
782
+ self.enc_q = PosteriorEncoder(
783
+ spec_channels,
784
+ inter_channels,
785
+ hidden_channels,
786
+ 5,
787
+ 1,
788
+ 16,
789
+ gin_channels=gin_channels,
790
+ )
791
+ if use_transformer_flow:
792
+ self.flow = TransformerCouplingBlock(
793
+ inter_channels,
794
+ hidden_channels,
795
+ filter_channels,
796
+ n_heads,
797
+ n_layers_trans_flow,
798
+ 5,
799
+ p_dropout,
800
+ n_flow_layer,
801
+ gin_channels=gin_channels,
802
+ share_parameter=flow_share_parameter,
803
+ )
804
+ else:
805
+ self.flow = ResidualCouplingBlock(
806
+ inter_channels,
807
+ hidden_channels,
808
+ 5,
809
+ 1,
810
+ n_flow_layer,
811
+ gin_channels=gin_channels,
812
+ )
813
+ self.sdp = StochasticDurationPredictor(
814
+ hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
815
+ )
816
+ self.dp = DurationPredictor(
817
+ hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
818
+ )
819
+
820
+ if n_speakers >= 1:
821
+ self.emb_g = nn.Embedding(n_speakers, gin_channels)
822
+ else:
823
+ self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)
824
+
825
+ def export_onnx(
826
+ self,
827
+ path,
828
+ max_len=None,
829
+ sdp_ratio=0,
830
+ y=None,
831
+ ):
832
+ noise_scale = 0.667
833
+ length_scale = 1
834
+ noise_scale_w = 0.8
835
+ x = (
836
+ torch.LongTensor(
837
+ [
838
+ 0,
839
+ 97,
840
+ 0,
841
+ 8,
842
+ 0,
843
+ 78,
844
+ 0,
845
+ 8,
846
+ 0,
847
+ 76,
848
+ 0,
849
+ 37,
850
+ 0,
851
+ 40,
852
+ 0,
853
+ 97,
854
+ 0,
855
+ 8,
856
+ 0,
857
+ 23,
858
+ 0,
859
+ 8,
860
+ 0,
861
+ 74,
862
+ 0,
863
+ 26,
864
+ 0,
865
+ 104,
866
+ 0,
867
+ ]
868
+ )
869
+ .unsqueeze(0)
870
+ .cpu()
871
+ )
872
+ tone = torch.zeros_like(x).cpu()
873
+ language = torch.zeros_like(x).cpu()
874
+ x_lengths = torch.LongTensor([x.shape[1]]).cpu()
875
+ sid = torch.LongTensor([0]).cpu()
876
+ bert = torch.randn(size=(x.shape[1], 1024)).cpu()
877
+ ja_bert = torch.randn(size=(x.shape[1], 1024)).cpu()
878
+ en_bert = torch.randn(size=(x.shape[1], 1024)).cpu()
879
+
880
+ if self.n_speakers > 0:
881
+ g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
882
+ torch.onnx.export(
883
+ self.emb_g,
884
+ (sid),
885
+ f"onnx/{path}/{path}_emb.onnx",
886
+ input_names=["sid"],
887
+ output_names=["g"],
888
+ verbose=True,
889
+ )
890
+ else:
891
+ g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
892
+
893
+ torch.onnx.export(
894
+ self.enc_p,
895
+ (x, x_lengths, tone, language, bert, ja_bert, en_bert, g),
896
+ f"onnx/{path}/{path}_enc_p.onnx",
897
+ input_names=[
898
+ "x",
899
+ "x_lengths",
900
+ "t",
901
+ "language",
902
+ "bert_0",
903
+ "bert_1",
904
+ "bert_2",
905
+ "g",
906
+ ],
907
+ output_names=["xout", "m_p", "logs_p", "x_mask"],
908
+ dynamic_axes={
909
+ "x": [0, 1],
910
+ "t": [0, 1],
911
+ "language": [0, 1],
912
+ "bert_0": [0],
913
+ "bert_1": [0],
914
+ "bert_2": [0],
915
+ "xout": [0, 2],
916
+ "m_p": [0, 2],
917
+ "logs_p": [0, 2],
918
+ "x_mask": [0, 2],
919
+ },
920
+ verbose=True,
921
+ opset_version=16,
922
+ )
923
+ x, m_p, logs_p, x_mask = self.enc_p(
924
+ x, x_lengths, tone, language, bert, ja_bert, en_bert, g=g
925
+ )
926
+ zinput = (
927
+ torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
928
+ * noise_scale_w
929
+ )
930
+ torch.onnx.export(
931
+ self.sdp,
932
+ (x, x_mask, zinput, g),
933
+ f"onnx/{path}/{path}_sdp.onnx",
934
+ input_names=["x", "x_mask", "zin", "g"],
935
+ output_names=["logw"],
936
+ dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "zin": [0, 2], "logw": [0, 2]},
937
+ verbose=True,
938
+ )
939
+ torch.onnx.export(
940
+ self.dp,
941
+ (x, x_mask, g),
942
+ f"onnx/{path}/{path}_dp.onnx",
943
+ input_names=["x", "x_mask", "g"],
944
+ output_names=["logw"],
945
+ dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "logw": [0, 2]},
946
+ verbose=True,
947
+ )
948
+ logw = self.sdp(x, x_mask, zinput, g=g) * (sdp_ratio) + self.dp(
949
+ x, x_mask, g=g
950
+ ) * (1 - sdp_ratio)
951
+ w = torch.exp(logw) * x_mask * length_scale
952
+ w_ceil = torch.ceil(w)
953
+ y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
954
+ y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(
955
+ x_mask.dtype
956
+ )
957
+ attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
958
+ attn = commons.generate_path(w_ceil, attn_mask)
959
+
960
+ m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(
961
+ 1, 2
962
+ ) # [b, t', t], [b, t, d] -> [b, d, t']
963
+ logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
964
+ 1, 2
965
+ ) # [b, t', t], [b, t, d] -> [b, d, t']
966
+
967
+ z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
968
+ torch.onnx.export(
969
+ self.flow,
970
+ (z_p, y_mask, g),
971
+ f"onnx/{path}/{path}_flow.onnx",
972
+ input_names=["z_p", "y_mask", "g"],
973
+ output_names=["z"],
974
+ dynamic_axes={"z_p": [0, 2], "y_mask": [0, 2], "z": [0, 2]},
975
+ verbose=True,
976
+ )
977
+
978
+ z = self.flow(z_p, y_mask, g=g, reverse=True)
979
+ z_in = (z * y_mask)[:, :, :max_len]
980
+
981
+ torch.onnx.export(
982
+ self.dec,
983
+ (z_in, g),
984
+ f"onnx/{path}/{path}_dec.onnx",
985
+ input_names=["z_in", "g"],
986
+ output_names=["o"],
987
+ dynamic_axes={"z_in": [0, 2], "o": [0, 2]},
988
+ verbose=True,
989
+ )
990
+ o = self.dec((z * y_mask)[:, :, :max_len], g=g)
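For reference, export_onnx above writes the synthesizer out as separate graphs under onnx/<name>/: the speaker embedding when n_speakers > 0 (_emb), the text encoder (_enc_p), both duration predictors (_sdp, _dp), the flow (_flow) and the decoder (_dec). A minimal driver sketch, assuming a trained checkpoint and the repo's utils helpers; the paths and model name below are placeholders, not files added by this commit:

    import os
    import torch
    import utils
    from onnx_modules.V200.models_onnx import SynthesizerTrn
    from onnx_modules.V200.text.symbols import symbols

    hps = utils.get_hparams_from_file("path/to/config.json")  # placeholder path
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model,
    )
    utils.load_checkpoint("path/to/G_latest.pth", net_g, None, skip_optimizer=True)
    net_g.cpu().eval()
    os.makedirs("onnx/MyModel", exist_ok=True)  # export_onnx writes into onnx/<name>/
    with torch.no_grad():
        net_g.export_onnx("MyModel")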
onnx_modules/V200/text/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .symbols import *
onnx_modules/V200/text/bert_utils.py ADDED
@@ -0,0 +1,23 @@
1
+ from pathlib import Path
2
+
3
+ from huggingface_hub import hf_hub_download
4
+
5
+ from config import config
6
+
7
+
8
+ MIRROR: str = config.mirror
9
+
10
+
11
+ def _check_bert(repo_id, files, local_path):
12
+ for file in files:
13
+ if not Path(local_path).joinpath(file).exists():
14
+ if MIRROR.lower() == "openi":
15
+ import openi
16
+
17
+ openi.model.download_model(
18
+ "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert"
19
+ )
20
+ else:
21
+ hf_hub_download(
22
+ repo_id, file, local_dir=local_path, local_dir_use_symlinks=False
23
+ )
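_check_bert simply downloads any files missing from a local BERT directory, either from the openi mirror or from Hugging Face Hub. A hedged usage sketch; the repo id and file name below are illustrative and are normally supplied by the calling module rather than hard-coded here:

    from onnx_modules.V200.text.bert_utils import _check_bert

    _check_bert(
        "hfl/chinese-roberta-wwm-ext-large",    # illustrative repo id
        ["pytorch_model.bin"],                  # illustrative file list
        "./bert/chinese-roberta-wwm-ext-large",
    )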
onnx_modules/V200/text/chinese.py ADDED
@@ -0,0 +1,198 @@
1
+ import os
2
+ import re
3
+
4
+ import cn2an
5
+ from pypinyin import lazy_pinyin, Style
6
+
7
+ from .symbols import punctuation
8
+ from .tone_sandhi import ToneSandhi
9
+
10
+ current_file_path = os.path.dirname(__file__)
11
+ pinyin_to_symbol_map = {
12
+ line.split("\t")[0]: line.strip().split("\t")[1]
13
+ for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
14
+ }
15
+
16
+ import jieba.posseg as psg
17
+
18
+
19
+ rep_map = {
20
+ ":": ",",
21
+ ";": ",",
22
+ ",": ",",
23
+ "。": ".",
24
+ "!": "!",
25
+ "?": "?",
26
+ "\n": ".",
27
+ "·": ",",
28
+ "、": ",",
29
+ "...": "…",
30
+ "$": ".",
31
+ "“": "'",
32
+ "”": "'",
33
+ "‘": "'",
34
+ "’": "'",
35
+ "(": "'",
36
+ ")": "'",
37
+ "(": "'",
38
+ ")": "'",
39
+ "《": "'",
40
+ "》": "'",
41
+ "【": "'",
42
+ "】": "'",
43
+ "[": "'",
44
+ "]": "'",
45
+ "—": "-",
46
+ "~": "-",
47
+ "~": "-",
48
+ "「": "'",
49
+ "」": "'",
50
+ }
51
+
52
+ tone_modifier = ToneSandhi()
53
+
54
+
55
+ def replace_punctuation(text):
56
+ text = text.replace("嗯", "恩").replace("呣", "母")
57
+ pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
58
+
59
+ replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
60
+
61
+ replaced_text = re.sub(
62
+ r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
63
+ )
64
+
65
+ return replaced_text
66
+
67
+
68
+ def g2p(text):
69
+ pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
70
+ sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
71
+ phones, tones, word2ph = _g2p(sentences)
72
+ assert sum(word2ph) == len(phones)
73
+ assert len(word2ph) == len(text)  # This assertion can occasionally fail; wrap it in a try-except if needed.
74
+ phones = ["_"] + phones + ["_"]
75
+ tones = [0] + tones + [0]
76
+ word2ph = [1] + word2ph + [1]
77
+ return phones, tones, word2ph
78
+
79
+
80
+ def _get_initials_finals(word):
81
+ initials = []
82
+ finals = []
83
+ orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
84
+ orig_finals = lazy_pinyin(
85
+ word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
86
+ )
87
+ for c, v in zip(orig_initials, orig_finals):
88
+ initials.append(c)
89
+ finals.append(v)
90
+ return initials, finals
91
+
92
+
93
+ def _g2p(segments):
94
+ phones_list = []
95
+ tones_list = []
96
+ word2ph = []
97
+ for seg in segments:
98
+ # Strip all English words from the sentence
99
+ seg = re.sub("[a-zA-Z]+", "", seg)
100
+ seg_cut = psg.lcut(seg)
101
+ initials = []
102
+ finals = []
103
+ seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
104
+ for word, pos in seg_cut:
105
+ if pos == "eng":
106
+ continue
107
+ sub_initials, sub_finals = _get_initials_finals(word)
108
+ sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
109
+ initials.append(sub_initials)
110
+ finals.append(sub_finals)
111
+
112
+ # assert len(sub_initials) == len(sub_finals) == len(word)
113
+ initials = sum(initials, [])
114
+ finals = sum(finals, [])
115
+ #
116
+ for c, v in zip(initials, finals):
117
+ raw_pinyin = c + v
118
+ # NOTE: post process for pypinyin outputs
119
+ # we discriminate i, ii and iii
120
+ if c == v:
121
+ assert c in punctuation
122
+ phone = [c]
123
+ tone = "0"
124
+ word2ph.append(1)
125
+ else:
126
+ v_without_tone = v[:-1]
127
+ tone = v[-1]
128
+
129
+ pinyin = c + v_without_tone
130
+ assert tone in "12345"
131
+
132
+ if c:
133
+ # 多音节
134
+ v_rep_map = {
135
+ "uei": "ui",
136
+ "iou": "iu",
137
+ "uen": "un",
138
+ }
139
+ if v_without_tone in v_rep_map.keys():
140
+ pinyin = c + v_rep_map[v_without_tone]
141
+ else:
142
+ # 单音节
143
+ pinyin_rep_map = {
144
+ "ing": "ying",
145
+ "i": "yi",
146
+ "in": "yin",
147
+ "u": "wu",
148
+ }
149
+ if pinyin in pinyin_rep_map.keys():
150
+ pinyin = pinyin_rep_map[pinyin]
151
+ else:
152
+ single_rep_map = {
153
+ "v": "yu",
154
+ "e": "e",
155
+ "i": "y",
156
+ "u": "w",
157
+ }
158
+ if pinyin[0] in single_rep_map.keys():
159
+ pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
160
+
161
+ assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
162
+ phone = pinyin_to_symbol_map[pinyin].split(" ")
163
+ word2ph.append(len(phone))
164
+
165
+ phones_list += phone
166
+ tones_list += [int(tone)] * len(phone)
167
+ return phones_list, tones_list, word2ph
168
+
169
+
170
+ def text_normalize(text):
171
+ numbers = re.findall(r"\d+(?:\.?\d+)?", text)
172
+ for number in numbers:
173
+ text = text.replace(number, cn2an.an2cn(number), 1)
174
+ text = replace_punctuation(text)
175
+ return text
176
+
177
+
178
+ def get_bert_feature(text, word2ph):
179
+ from text import chinese_bert
180
+
181
+ return chinese_bert.get_bert_feature(text, word2ph)
182
+
183
+
184
+ if __name__ == "__main__":
185
+ from text.chinese_bert import get_bert_feature
186
+
187
+ text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
188
+ text = text_normalize(text)
189
+ print(text)
190
+ phones, tones, word2ph = g2p(text)
191
+ bert = get_bert_feature(text, word2ph)
192
+
193
+ print(phones, tones, word2ph, bert.shape)
194
+
195
+
196
+ # # Example usage
197
+ # text = "这是一个示例文本:,你好!这是一个测试...."
198
+ # print(g2p_paddle(text)) # Output: 这是一个示例文本你好这是一个测试
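For reference, g2p pads its outputs with a leading and trailing "_" phone, so they satisfy sum(word2ph) == len(phones) and len(word2ph) == len(normalized_text) + 2, which is exactly what chinese_bert.get_bert_feature asserts. A small sanity-check sketch, assuming the accompanying opencpop-strict.txt dictionary ships alongside the module:

    from onnx_modules.V200.text import chinese

    norm = chinese.text_normalize("今天天气真不错。")
    phones, tones, word2ph = chinese.g2p(norm)
    assert sum(word2ph) == len(phones)
    assert len(word2ph) == len(norm) + 2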
onnx_modules/V200/text/chinese_bert.py ADDED
@@ -0,0 +1,101 @@
1
+ import sys
2
+
3
+ import torch
4
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
5
+
6
+ from config import config
7
+
8
+ LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9
+
10
+ tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11
+
12
+ models = dict()
13
+
14
+
15
+ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
16
+ if (
17
+ sys.platform == "darwin"
18
+ and torch.backends.mps.is_available()
19
+ and device == "cpu"
20
+ ):
21
+ device = "mps"
22
+ if not device:
23
+ device = "cuda"
24
+ if device not in models.keys():
25
+ models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
26
+ with torch.no_grad():
27
+ inputs = tokenizer(text, return_tensors="pt")
28
+ for i in inputs:
29
+ inputs[i] = inputs[i].to(device)
30
+ res = models[device](**inputs, output_hidden_states=True)
31
+ res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
32
+
33
+ assert len(word2ph) == len(text) + 2
34
+ word2phone = word2ph
35
+ phone_level_feature = []
36
+ for i in range(len(word2phone)):
37
+ repeat_feature = res[i].repeat(word2phone[i], 1)
38
+ phone_level_feature.append(repeat_feature)
39
+
40
+ phone_level_feature = torch.cat(phone_level_feature, dim=0)
41
+
42
+ return phone_level_feature.T
43
+
44
+
45
+ if __name__ == "__main__":
46
+ word_level_feature = torch.rand(38, 1024) # 38 words, each with a 1024-dim feature
47
+ word2phone = [
48
+ 1,
49
+ 2,
50
+ 1,
51
+ 2,
52
+ 2,
53
+ 1,
54
+ 2,
55
+ 2,
56
+ 1,
57
+ 2,
58
+ 2,
59
+ 1,
60
+ 2,
61
+ 2,
62
+ 2,
63
+ 2,
64
+ 2,
65
+ 1,
66
+ 1,
67
+ 2,
68
+ 2,
69
+ 1,
70
+ 2,
71
+ 2,
72
+ 2,
73
+ 2,
74
+ 1,
75
+ 2,
76
+ 2,
77
+ 2,
78
+ 2,
79
+ 2,
80
+ 1,
81
+ 2,
82
+ 2,
83
+ 2,
84
+ 2,
85
+ 1,
86
+ ]
87
+
88
+ # 计算总帧数
89
+ total_frames = sum(word2phone)
90
+ print(word_level_feature.shape)
91
+ print(word2phone)
92
+ phone_level_feature = []
93
+ for i in range(len(word2phone)):
94
+ print(word_level_feature[i].shape)
95
+
96
+ # 对每个词重复word2phone[i]次
97
+ repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
98
+ phone_level_feature.append(repeat_feature)
99
+
100
+ phone_level_feature = torch.cat(phone_level_feature, dim=0)
101
+ print(phone_level_feature.shape) # torch.Size([sum(word2phone), 1024])
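Note that models above is a per-device cache: the first get_bert_feature call on a given device loads chinese-roberta-wwm-ext-large once and later calls reuse it. An end-to-end sketch, assuming the RoBERTa weights are available under ./bert/chinese-roberta-wwm-ext-large:

    from onnx_modules.V200.text import chinese
    from onnx_modules.V200.text.chinese_bert import get_bert_feature

    text = chinese.text_normalize("今天天气真好")
    phones, tones, word2ph = chinese.g2p(text)
    bert = get_bert_feature(text, word2ph, device="cpu")
    print(bert.shape)  # (1024, len(phones)), one column per phone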
onnx_modules/V200/text/cleaner.py ADDED
@@ -0,0 +1,28 @@
1
+ from . import chinese, japanese, english, cleaned_text_to_sequence
2
+
3
+
4
+ language_module_map = {"ZH": chinese, "JP": japanese, "EN": english}
5
+
6
+
7
+ def clean_text(text, language):
8
+ language_module = language_module_map[language]
9
+ norm_text = language_module.text_normalize(text)
10
+ phones, tones, word2ph = language_module.g2p(norm_text)
11
+ return norm_text, phones, tones, word2ph
12
+
13
+
14
+ def clean_text_bert(text, language):
15
+ language_module = language_module_map[language]
16
+ norm_text = language_module.text_normalize(text)
17
+ phones, tones, word2ph = language_module.g2p(norm_text)
18
+ bert = language_module.get_bert_feature(norm_text, word2ph)
19
+ return phones, tones, bert
20
+
21
+
22
+ def text_to_sequence(text, language):
23
+ norm_text, phones, tones, word2ph = clean_text(text, language)
24
+ return cleaned_text_to_sequence(phones, tones, language)
25
+
26
+
27
+ if __name__ == "__main__":
28
+ pass
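clean_text ties the language modules together: it normalizes the raw text for the given language ("ZH", "JP" or "EN") and returns the normalized text plus phones, tones and word2ph. A quick illustrative call, assuming the dependencies of all three language modules are installed (chinese, japanese and english are imported unconditionally):

    from onnx_modules.V200.text.cleaner import clean_text

    norm_text, phones, tones, word2ph = clean_text("Hello there!", "EN")
    print(norm_text)
    print(phones, tones, word2ph)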
onnx_modules/V200/text/english.py ADDED
@@ -0,0 +1,362 @@
1
+ import pickle
2
+ import os
3
+ import re
4
+ from g2p_en import G2p
5
+
6
+ from . import symbols
7
+
8
+ current_file_path = os.path.dirname(__file__)
9
+ CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
10
+ CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
11
+ _g2p = G2p()
12
+
13
+ arpa = {
14
+ "AH0",
15
+ "S",
16
+ "AH1",
17
+ "EY2",
18
+ "AE2",
19
+ "EH0",
20
+ "OW2",
21
+ "UH0",
22
+ "NG",
23
+ "B",
24
+ "G",
25
+ "AY0",
26
+ "M",
27
+ "AA0",
28
+ "F",
29
+ "AO0",
30
+ "ER2",
31
+ "UH1",
32
+ "IY1",
33
+ "AH2",
34
+ "DH",
35
+ "IY0",
36
+ "EY1",
37
+ "IH0",
38
+ "K",
39
+ "N",
40
+ "W",
41
+ "IY2",
42
+ "T",
43
+ "AA1",
44
+ "ER1",
45
+ "EH2",
46
+ "OY0",
47
+ "UH2",
48
+ "UW1",
49
+ "Z",
50
+ "AW2",
51
+ "AW1",
52
+ "V",
53
+ "UW2",
54
+ "AA2",
55
+ "ER",
56
+ "AW0",
57
+ "UW0",
58
+ "R",
59
+ "OW1",
60
+ "EH1",
61
+ "ZH",
62
+ "AE0",
63
+ "IH2",
64
+ "IH",
65
+ "Y",
66
+ "JH",
67
+ "P",
68
+ "AY1",
69
+ "EY0",
70
+ "OY2",
71
+ "TH",
72
+ "HH",
73
+ "D",
74
+ "ER0",
75
+ "CH",
76
+ "AO1",
77
+ "AE1",
78
+ "AO2",
79
+ "OY1",
80
+ "AY2",
81
+ "IH1",
82
+ "OW0",
83
+ "L",
84
+ "SH",
85
+ }
86
+
87
+
88
+ def post_replace_ph(ph):
89
+ rep_map = {
90
+ ":": ",",
91
+ ";": ",",
92
+ ",": ",",
93
+ "。": ".",
94
+ "!": "!",
95
+ "?": "?",
96
+ "\n": ".",
97
+ "·": ",",
98
+ "、": ",",
99
+ "...": "…",
100
+ "v": "V",
101
+ }
102
+ if ph in rep_map.keys():
103
+ ph = rep_map[ph]
104
+ if ph in symbols:
105
+ return ph
106
+ if ph not in symbols:
107
+ ph = "UNK"
108
+ return ph
109
+
110
+
111
+ def read_dict():
112
+ g2p_dict = {}
113
+ start_line = 49
114
+ with open(CMU_DICT_PATH) as f:
115
+ line = f.readline()
116
+ line_index = 1
117
+ while line:
118
+ if line_index >= start_line:
119
+ line = line.strip()
120
+ word_split = line.split(" ")
121
+ word = word_split[0]
122
+
123
+ syllable_split = word_split[1].split(" - ")
124
+ g2p_dict[word] = []
125
+ for syllable in syllable_split:
126
+ phone_split = syllable.split(" ")
127
+ g2p_dict[word].append(phone_split)
128
+
129
+ line_index = line_index + 1
130
+ line = f.readline()
131
+
132
+ return g2p_dict
133
+
134
+
135
+ def cache_dict(g2p_dict, file_path):
136
+ with open(file_path, "wb") as pickle_file:
137
+ pickle.dump(g2p_dict, pickle_file)
138
+
139
+
140
+ def get_dict():
141
+ if os.path.exists(CACHE_PATH):
142
+ with open(CACHE_PATH, "rb") as pickle_file:
143
+ g2p_dict = pickle.load(pickle_file)
144
+ else:
145
+ g2p_dict = read_dict()
146
+ cache_dict(g2p_dict, CACHE_PATH)
147
+
148
+ return g2p_dict
149
+
150
+
151
+ eng_dict = get_dict()
152
+
153
+
154
+ def refine_ph(phn):
155
+ tone = 0
156
+ if re.search(r"\d$", phn):
157
+ tone = int(phn[-1]) + 1
158
+ phn = phn[:-1]
159
+ return phn.lower(), tone
160
+
161
+
162
+ def refine_syllables(syllables):
163
+ tones = []
164
+ phonemes = []
165
+ for phn_list in syllables:
166
+ for i in range(len(phn_list)):
167
+ phn = phn_list[i]
168
+ phn, tone = refine_ph(phn)
169
+ phonemes.append(phn)
170
+ tones.append(tone)
171
+ return phonemes, tones
172
+
173
+
174
+ import re
175
+ import inflect
176
+
177
+ _inflect = inflect.engine()
178
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
179
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
180
+ _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
181
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
182
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
183
+ _number_re = re.compile(r"[0-9]+")
184
+
185
+ # List of (regular expression, replacement) pairs for abbreviations:
186
+ _abbreviations = [
187
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
188
+ for x in [
189
+ ("mrs", "misess"),
190
+ ("mr", "mister"),
191
+ ("dr", "doctor"),
192
+ ("st", "saint"),
193
+ ("co", "company"),
194
+ ("jr", "junior"),
195
+ ("maj", "major"),
196
+ ("gen", "general"),
197
+ ("drs", "doctors"),
198
+ ("rev", "reverend"),
199
+ ("lt", "lieutenant"),
200
+ ("hon", "honorable"),
201
+ ("sgt", "sergeant"),
202
+ ("capt", "captain"),
203
+ ("esq", "esquire"),
204
+ ("ltd", "limited"),
205
+ ("col", "colonel"),
206
+ ("ft", "fort"),
207
+ ]
208
+ ]
209
+
210
+
211
+ # List of (ipa, lazy ipa) pairs:
212
+ _lazy_ipa = [
213
+ (re.compile("%s" % x[0]), x[1])
214
+ for x in [
215
+ ("r", "ɹ"),
216
+ ("æ", "e"),
217
+ ("ɑ", "a"),
218
+ ("ɔ", "o"),
219
+ ("ð", "z"),
220
+ ("θ", "s"),
221
+ ("ɛ", "e"),
222
+ ("ɪ", "i"),
223
+ ("ʊ", "u"),
224
+ ("ʒ", "ʥ"),
225
+ ("ʤ", "ʥ"),
226
+ ("ˈ", "↓"),
227
+ ]
228
+ ]
229
+
230
+ # List of (ipa, lazy ipa2) pairs:
231
+ _lazy_ipa2 = [
232
+ (re.compile("%s" % x[0]), x[1])
233
+ for x in [
234
+ ("r", "ɹ"),
235
+ ("ð", "z"),
236
+ ("θ", "s"),
237
+ ("ʒ", "ʑ"),
238
+ ("ʤ", "dʑ"),
239
+ ("ˈ", "↓"),
240
+ ]
241
+ ]
242
+
243
+ # List of (ipa, ipa2) pairs
244
+ _ipa_to_ipa2 = [
245
+ (re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
246
+ ]
247
+
248
+
249
+ def _expand_dollars(m):
250
+ match = m.group(1)
251
+ parts = match.split(".")
252
+ if len(parts) > 2:
253
+ return match + " dollars" # Unexpected format
254
+ dollars = int(parts[0]) if parts[0] else 0
255
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
256
+ if dollars and cents:
257
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
258
+ cent_unit = "cent" if cents == 1 else "cents"
259
+ return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
260
+ elif dollars:
261
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
262
+ return "%s %s" % (dollars, dollar_unit)
263
+ elif cents:
264
+ cent_unit = "cent" if cents == 1 else "cents"
265
+ return "%s %s" % (cents, cent_unit)
266
+ else:
267
+ return "zero dollars"
268
+
269
+
270
+ def _remove_commas(m):
271
+ return m.group(1).replace(",", "")
272
+
273
+
274
+ def _expand_ordinal(m):
275
+ return _inflect.number_to_words(m.group(0))
276
+
277
+
278
+ def _expand_number(m):
279
+ num = int(m.group(0))
280
+ if num > 1000 and num < 3000:
281
+ if num == 2000:
282
+ return "two thousand"
283
+ elif num > 2000 and num < 2010:
284
+ return "two thousand " + _inflect.number_to_words(num % 100)
285
+ elif num % 100 == 0:
286
+ return _inflect.number_to_words(num // 100) + " hundred"
287
+ else:
288
+ return _inflect.number_to_words(
289
+ num, andword="", zero="oh", group=2
290
+ ).replace(", ", " ")
291
+ else:
292
+ return _inflect.number_to_words(num, andword="")
293
+
294
+
295
+ def _expand_decimal_point(m):
296
+ return m.group(1).replace(".", " point ")
297
+
298
+
299
+ def normalize_numbers(text):
300
+ text = re.sub(_comma_number_re, _remove_commas, text)
301
+ text = re.sub(_pounds_re, r"\1 pounds", text)
302
+ text = re.sub(_dollars_re, _expand_dollars, text)
303
+ text = re.sub(_decimal_number_re, _expand_decimal_point, text)
304
+ text = re.sub(_ordinal_re, _expand_ordinal, text)
305
+ text = re.sub(_number_re, _expand_number, text)
306
+ return text
307
+
308
+
309
+ def text_normalize(text):
310
+ text = normalize_numbers(text)
311
+ return text
312
+
313
+
314
+ def g2p(text):
315
+ phones = []
316
+ tones = []
317
+ word2ph = []
318
+ words = re.split(r"([,;.\-\?\!\s+])", text)
319
+ words = [word for word in words if word.strip() != ""]
320
+ for word in words:
321
+ if word.upper() in eng_dict:
322
+ phns, tns = refine_syllables(eng_dict[word.upper()])
323
+ phones += phns
324
+ tones += tns
325
+ word2ph.append(len(phns))
326
+ else:
327
+ phone_list = list(filter(lambda p: p != " ", _g2p(word)))
328
+ for ph in phone_list:
329
+ if ph in arpa:
330
+ ph, tn = refine_ph(ph)
331
+ phones.append(ph)
332
+ tones.append(tn)
333
+ else:
334
+ phones.append(ph)
335
+ tones.append(0)
336
+ word2ph.append(len(phone_list))
337
+
338
+ phones = [post_replace_ph(i) for i in phones]
339
+
340
+ phones = ["_"] + phones + ["_"]
341
+ tones = [0] + tones + [0]
342
+ word2ph = [1] + word2ph + [1]
343
+
344
+ return phones, tones, word2ph
345
+
346
+
347
+ def get_bert_feature(text, word2ph):
348
+ from text import english_bert_mock
349
+
350
+ return english_bert_mock.get_bert_feature(text, word2ph)
351
+
352
+
353
+ if __name__ == "__main__":
354
+ # print(get_dict())
355
+ # print(eng_word_to_phoneme("hello"))
356
+ print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
357
+ # all_phones = set()
358
+ # for k, syllables in eng_dict.items():
359
+ # for group in syllables:
360
+ # for ph in group:
361
+ # all_phones.add(ph)
362
+ # print(all_phones)
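text_normalize here only expands numbers: normalize_numbers spells out plain numbers, decimals, currency amounts and ordinals through inflect before g2p runs. A small illustrative call, assuming cmudict.rep ships alongside the module (the exact wording of the expansion comes from inflect):

    from onnx_modules.V200.text.english import text_normalize, g2p

    s = text_normalize("I paid $3.50 for the 2nd ticket in 1999.")
    print(s)       # numbers, dollars and ordinals spelled out
    print(g2p(s))  # phones, tones, word2ph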
onnx_modules/V200/text/english_bert_mock.py ADDED
@@ -0,0 +1,42 @@
1
+ import sys
2
+
3
+ import torch
4
+ from transformers import DebertaV2Model, DebertaV2Tokenizer
5
+
6
+ from config import config
7
+
8
+
9
+ LOCAL_PATH = "./bert/deberta-v3-large"
10
+
11
+ tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12
+
13
+ models = dict()
14
+
15
+
16
+ def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
17
+ if (
18
+ sys.platform == "darwin"
19
+ and torch.backends.mps.is_available()
20
+ and device == "cpu"
21
+ ):
22
+ device = "mps"
23
+ if not device:
24
+ device = "cuda"
25
+ if device not in models.keys():
26
+ models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
27
+ with torch.no_grad():
28
+ inputs = tokenizer(text, return_tensors="pt")
29
+ for i in inputs:
30
+ inputs[i] = inputs[i].to(device)
31
+ res = models[device](**inputs, output_hidden_states=True)
32
+ res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
33
+ # assert len(word2ph) == len(text)+2
34
+ word2phone = word2ph
35
+ phone_level_feature = []
36
+ for i in range(len(word2phone)):
37
+ repeat_feature = res[i].repeat(word2phone[i], 1)
38
+ phone_level_feature.append(repeat_feature)
39
+
40
+ phone_level_feature = torch.cat(phone_level_feature, dim=0)
41
+
42
+ return phone_level_feature.T
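Despite the "mock" in its file name, this module extracts real DeBERTa-v3-large hidden states; word2ph comes from english.g2p, so features are repeated per split word rather than per character. A hedged usage sketch, assuming the deberta-v3-large weights are present under ./bert/deberta-v3-large:

    from onnx_modules.V200.text import english
    from onnx_modules.V200.text.english_bert_mock import get_bert_feature

    text = english.text_normalize("How are you?")
    phones, tones, word2ph = english.g2p(text)
    bert = get_bert_feature(text, word2ph, device="cpu")
    print(bert.shape)  # (1024, sum(word2ph))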