Plana-Archive commited on
Commit
ae91fda
·
verified ·
1 Parent(s): 02e3f87

Migrasi VITS Model ke Folder Baru

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. .gitattributes +36 -0
  2. VITS-MODELS/app.py +290 -0
  3. VITS-MODELS/attentions.py +300 -0
  4. VITS-MODELS/commons.py +172 -0
  5. VITS-MODELS/config/config.json +55 -0
  6. VITS-MODELS/mel_processing.py +101 -0
  7. VITS-MODELS/models.py +533 -0
  8. VITS-MODELS/modules.py +388 -0
  9. VITS-MODELS/monotonic_align/__init__.py +20 -0
  10. VITS-MODELS/monotonic_align/core.py +36 -0
  11. VITS-MODELS/pretrained_models/abyssinvoker/abyssinvoker.pth +3 -0
  12. VITS-MODELS/pretrained_models/abyssinvoker/cover.png +3 -0
  13. VITS-MODELS/pretrained_models/alice/alice.pth +3 -0
  14. VITS-MODELS/pretrained_models/alice/cover.png +3 -0
  15. VITS-MODELS/pretrained_models/ameth/ameth.pth +3 -0
  16. VITS-MODELS/pretrained_models/ameth/cover.png +3 -0
  17. VITS-MODELS/pretrained_models/asuna/asuna.pth +3 -0
  18. VITS-MODELS/pretrained_models/asuna/cover.png +3 -0
  19. VITS-MODELS/pretrained_models/ayaka-jp/ayaka-jp.pth +3 -0
  20. VITS-MODELS/pretrained_models/ayaka-jp/cover.png +3 -0
  21. VITS-MODELS/pretrained_models/azusa/azusa.pth +3 -0
  22. VITS-MODELS/pretrained_models/azusa/cover.png +3 -0
  23. VITS-MODELS/pretrained_models/bronya/bronya.pth +3 -0
  24. VITS-MODELS/pretrained_models/bronya/cover.png +0 -0
  25. VITS-MODELS/pretrained_models/chisato/chisato.pth +3 -0
  26. VITS-MODELS/pretrained_models/chisato/cover.png +3 -0
  27. VITS-MODELS/pretrained_models/doom/cover.png +3 -0
  28. VITS-MODELS/pretrained_models/doom/doom.pth +3 -0
  29. VITS-MODELS/pretrained_models/echo/cover.png +3 -0
  30. VITS-MODELS/pretrained_models/echo/echo.pth +3 -0
  31. VITS-MODELS/pretrained_models/eriko/cover.png +3 -0
  32. VITS-MODELS/pretrained_models/eriko/eriko.pth +3 -0
  33. VITS-MODELS/pretrained_models/eula/cover.png +3 -0
  34. VITS-MODELS/pretrained_models/eula/eula.pth +3 -0
  35. VITS-MODELS/pretrained_models/hatsune/cover.png +3 -0
  36. VITS-MODELS/pretrained_models/hatsune/hatsune.pth +3 -0
  37. VITS-MODELS/pretrained_models/herta/cover.png +3 -0
  38. VITS-MODELS/pretrained_models/herta/herta.pth +3 -0
  39. VITS-MODELS/pretrained_models/hina/cover.png +3 -0
  40. VITS-MODELS/pretrained_models/hina/hina.pth +3 -0
  41. VITS-MODELS/pretrained_models/hiyori/cover.png +3 -0
  42. VITS-MODELS/pretrained_models/hiyori/hiyori.pth +3 -0
  43. VITS-MODELS/pretrained_models/hoshino/cover.png +3 -0
  44. VITS-MODELS/pretrained_models/hoshino/hoshino.pth +3 -0
  45. VITS-MODELS/pretrained_models/info.json +420 -0
  46. VITS-MODELS/pretrained_models/iori/cover.png +3 -0
  47. VITS-MODELS/pretrained_models/iori/iori.pth +3 -0
  48. VITS-MODELS/pretrained_models/iroha/cover.png +0 -0
  49. VITS-MODELS/pretrained_models/iroha/iroha.pth +3 -0
  50. VITS-MODELS/pretrained_models/izuna/cover.png +3 -0
.gitattributes CHANGED
@@ -33,3 +33,39 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ VITS-MODELS/pretrained_models/abyssinvoker/cover.png filter=lfs diff=lfs merge=lfs -text
37
+ VITS-MODELS/pretrained_models/alice/cover.png filter=lfs diff=lfs merge=lfs -text
38
+ VITS-MODELS/pretrained_models/ameth/cover.png filter=lfs diff=lfs merge=lfs -text
39
+ VITS-MODELS/pretrained_models/asuna/cover.png filter=lfs diff=lfs merge=lfs -text
40
+ VITS-MODELS/pretrained_models/ayaka-jp/cover.png filter=lfs diff=lfs merge=lfs -text
41
+ VITS-MODELS/pretrained_models/azusa/cover.png filter=lfs diff=lfs merge=lfs -text
42
+ VITS-MODELS/pretrained_models/chisato/cover.png filter=lfs diff=lfs merge=lfs -text
43
+ VITS-MODELS/pretrained_models/doom/cover.png filter=lfs diff=lfs merge=lfs -text
44
+ VITS-MODELS/pretrained_models/echo/cover.png filter=lfs diff=lfs merge=lfs -text
45
+ VITS-MODELS/pretrained_models/eriko/cover.png filter=lfs diff=lfs merge=lfs -text
46
+ VITS-MODELS/pretrained_models/eula/cover.png filter=lfs diff=lfs merge=lfs -text
47
+ VITS-MODELS/pretrained_models/hatsune/cover.png filter=lfs diff=lfs merge=lfs -text
48
+ VITS-MODELS/pretrained_models/herta/cover.png filter=lfs diff=lfs merge=lfs -text
49
+ VITS-MODELS/pretrained_models/hina/cover.png filter=lfs diff=lfs merge=lfs -text
50
+ VITS-MODELS/pretrained_models/hiyori/cover.png filter=lfs diff=lfs merge=lfs -text
51
+ VITS-MODELS/pretrained_models/hoshino/cover.png filter=lfs diff=lfs merge=lfs -text
52
+ VITS-MODELS/pretrained_models/iori/cover.png filter=lfs diff=lfs merge=lfs -text
53
+ VITS-MODELS/pretrained_models/izuna/cover.png filter=lfs diff=lfs merge=lfs -text
54
+ VITS-MODELS/pretrained_models/kafka/cover.png filter=lfs diff=lfs merge=lfs -text
55
+ VITS-MODELS/pretrained_models/karin/cover.png filter=lfs diff=lfs merge=lfs -text
56
+ VITS-MODELS/pretrained_models/keqing/cover.png filter=lfs diff=lfs merge=lfs -text
57
+ VITS-MODELS/pretrained_models/kokoro/cover.png filter=lfs diff=lfs merge=lfs -text
58
+ VITS-MODELS/pretrained_models/kyaru/cover.png filter=lfs diff=lfs merge=lfs -text
59
+ VITS-MODELS/pretrained_models/kyoka/cover.png filter=lfs diff=lfs merge=lfs -text
60
+ VITS-MODELS/pretrained_models/mika/cover.png filter=lfs diff=lfs merge=lfs -text
61
+ VITS-MODELS/pretrained_models/misora/cover.png filter=lfs diff=lfs merge=lfs -text
62
+ VITS-MODELS/pretrained_models/miyu/cover.png filter=lfs diff=lfs merge=lfs -text
63
+ VITS-MODELS/pretrained_models/momoi/cover.png filter=lfs diff=lfs merge=lfs -text
64
+ VITS-MODELS/pretrained_models/nahida-jp/cover.png filter=lfs diff=lfs merge=lfs -text
65
+ VITS-MODELS/pretrained_models/pecorine/cover.png filter=lfs diff=lfs merge=lfs -text
66
+ VITS-MODELS/pretrained_models/shiroko/cover.png filter=lfs diff=lfs merge=lfs -text
67
+ VITS-MODELS/pretrained_models/takina/cover.png filter=lfs diff=lfs merge=lfs -text
68
+ VITS-MODELS/pretrained_models/theresa/cover.png filter=lfs diff=lfs merge=lfs -text
69
+ VITS-MODELS/pretrained_models/yuni/cover.png filter=lfs diff=lfs merge=lfs -text
70
+ VITS-MODELS/pretrained_models/yuuka/cover.png filter=lfs diff=lfs merge=lfs -text
71
+ VITS-MODELS/pretrained_models/zenyatta/cover.png filter=lfs diff=lfs merge=lfs -text
VITS-MODELS/app.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ import os
3
+ import re
4
+ import argparse
5
+ import utils
6
+ import commons
7
+ import json
8
+ import torch
9
+ import gradio as gr
10
+ from models import SynthesizerTrn
11
+ from text import text_to_sequence, _clean_text
12
+ from torch import no_grad, LongTensor
13
+ import gradio.processing_utils as gr_processing_utils
14
+ import logging
15
+ logging.getLogger('numba').setLevel(logging.WARNING)
16
+ limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
17
+
18
+ hps_ms = utils.get_hparams_from_file(r'config/config.json')
19
+
20
+ audio_postprocess_ori = gr.Audio.postprocess
21
+
22
+ def audio_postprocess(self, y):
23
+ data = audio_postprocess_ori(self, y)
24
+ if data is None:
25
+ return None
26
+ return gr_processing_utils.encode_url_or_file_to_base64(data["name"])
27
+
28
+
29
+ gr.Audio.postprocess = audio_postprocess
30
+
31
+ def get_text(text, hps, is_symbol):
32
+ text_norm, clean_text = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
33
+ if hps.data.add_blank:
34
+ text_norm = commons.intersperse(text_norm, 0)
35
+ text_norm = LongTensor(text_norm)
36
+ return text_norm, clean_text
37
+
38
+ def create_tts_fn(net_g_ms, speaker_id):
39
+ def tts_fn(text, language, noise_scale, noise_scale_w, length_scale, is_symbol):
40
+ text = text.replace('\n', ' ').replace('\r', '').replace(" ", "")
41
+ if limitation:
42
+ text_len = len(re.sub("\[([A-Z]{2})\]", "", text))
43
+ max_len = 100
44
+ if is_symbol:
45
+ max_len *= 3
46
+ if text_len > max_len:
47
+ return "Error: Text is too long", None
48
+ if not is_symbol:
49
+ if language == 0:
50
+ text = f"[ZH]{text}[ZH]"
51
+ elif language == 1:
52
+ text = f"[JA]{text}[JA]"
53
+ else:
54
+ text = f"{text}"
55
+ stn_tst, clean_text = get_text(text, hps_ms, is_symbol)
56
+ with no_grad():
57
+ x_tst = stn_tst.unsqueeze(0).to(device)
58
+ x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
59
+ sid = LongTensor([speaker_id]).to(device)
60
+ audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
61
+ length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
62
+
63
+ return "Success", (22050, audio)
64
+ return tts_fn
65
+
66
+ def create_to_symbol_fn(hps):
67
+ def to_symbol_fn(is_symbol_input, input_text, temp_lang):
68
+ if temp_lang == 0:
69
+ clean_text = f'[ZH]{input_text}[ZH]'
70
+ elif temp_lang == 1:
71
+ clean_text = f'[JA]{input_text}[JA]'
72
+ else:
73
+ clean_text = input_text
74
+ return _clean_text(clean_text, hps.data.text_cleaners) if is_symbol_input else ''
75
+
76
+ return to_symbol_fn
77
+ def change_lang(language):
78
+ if language == 0:
79
+ return 0.6, 0.668, 1.2
80
+ elif language == 1:
81
+ return 0.6, 0.668, 1
82
+ else:
83
+ return 0.6, 0.668, 1
84
+
85
+ download_audio_js = """
86
+ () =>{{
87
+ let root = document.querySelector("body > gradio-app");
88
+ if (root.shadowRoot != null)
89
+ root = root.shadowRoot;
90
+ let audio = root.querySelector("#tts-audio-{audio_id}").querySelector("audio");
91
+ let text = root.querySelector("#input-text-{audio_id}").querySelector("textarea");
92
+ if (audio == undefined)
93
+ return;
94
+ text = text.value;
95
+ if (text == undefined)
96
+ text = Math.floor(Math.random()*100000000);
97
+ audio = audio.src;
98
+ let oA = document.createElement("a");
99
+ oA.download = text.substr(0, 20)+'.wav';
100
+ oA.href = audio;
101
+ document.body.appendChild(oA);
102
+ oA.click();
103
+ oA.remove();
104
+ }}
105
+ """
106
+
107
if __name__ == '__main__':
    # ---------------- CLI ----------------
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--api', action="store_true", default=False)
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    parser.add_argument("--all", action="store_true", default=False, help="enable all models")
    args = parser.parse_args()
    device = torch.device(args.device)

    # Categories hosted by this space; entries in `others` are linked out to
    # sibling spaces instead of being loaded here.
    categories = ["Honkai: Star Rail", "Blue Archive", "Lycoris Recoil"]
    others = {
        "Princess Connect! Re:Dive": "https://huggingface.co/spaces/sayashi/vits-models-pcr",
        "Genshin Impact": "https://huggingface.co/spaces/sayashi/vits-models-genshin-bh3",
        "Honkai Impact 3rd": "https://huggingface.co/spaces/sayashi/vits-models-genshin-bh3",
        "Overwatch 2": "https://huggingface.co/spaces/sayashi/vits-models-ow2",
    }
    if args.all:
        categories = ["Honkai: Star Rail", "Blue Archive", "Lycoris Recoil", "Princess Connect! Re:Dive", "Genshin Impact", "Honkai Impact 3rd", "Overwatch 2"]
        others = {}

    # ---------------- Load every enabled model for the hosted categories ----
    models = []
    with open("pretrained_models/info.json", "r", encoding="utf-8") as f:
        models_info = json.load(f)
    for i, info in models_info.items():
        # The category is encoded as the prefix of the title ("Category-Name").
        if info['title'].split("-")[0] not in categories or not info['enable']:
            continue
        sid = info['sid']
        name_en = info['name_en']
        name_zh = info['name_zh']
        title = info['title']
        cover = f"pretrained_models/{i}/{info['cover']}"
        example = info['example']
        language = info['language']
        net_g_ms = SynthesizerTrn(
            len(hps_ms.symbols),
            hps_ms.data.filter_length // 2 + 1,
            hps_ms.train.segment_size // hps_ms.data.hop_length,
            n_speakers=hps_ms.data.n_speakers if info['type'] == "multi" else 0,
            **hps_ms.model)
        utils.load_checkpoint(f'pretrained_models/{i}/{i}.pth', net_g_ms, None)
        _ = net_g_ms.eval().to(device)
        models.append((sid, name_en, name_zh, title, cover, example, language, net_g_ms,
                       create_tts_fn(net_g_ms, sid), create_to_symbol_fn(hps_ms)))

    # ---------------- UI ----------------
    with gr.Blocks() as app:
        gr.Markdown(
            "# <center> vits-models\n"
            "## <center> Please do not generate content that could infringe upon the rights or cause harm to individuals or organizations.\n"
            "## <center> 请不要生成会对个人以及组织造成侵害的内容\n\n"
            "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/10QOk9NPgoKZUXkIhhuVaZ7SYra1MPMKH?usp=share_link)\n\n"
            "[![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg)](https://huggingface.co/spaces/sayashi/vits-models?duplicate=true)\n\n"
            "[![Finetune your own model](https://badgen.net/badge/icon/github?icon=github&label=Finetune%20your%20own%20model)](https://github.com/SayaSS/vits-finetuning)"
        )

        with gr.Tabs():
            for category in categories:
                with gr.TabItem(category):
                    with gr.TabItem("EN"):
                        for (sid, name_en, name_zh, title, cover, example, language, net_g_ms, tts_fn, to_symbol_fn) in models:
                            if title.split("-")[0] != category:
                                continue
                            with gr.TabItem(name_en):
                                with gr.Row():
                                    # BUGFIX: the cover image conditional is now
                                    # parenthesised so the closing </div> is always
                                    # emitted. Previously the ternary bound to the
                                    # implicitly-concatenated literals, dropping
                                    # '</div>' whenever a cover existed.
                                    gr.Markdown(
                                        '<div align="center">'
                                        f'<a><strong>{title}</strong></a>'
                                        + (f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else "")
                                        + '</div>'
                                    )
                                with gr.Row():
                                    with gr.Column():
                                        input_text = gr.Textbox(label="Text (100 words limitation)" if limitation else "Text", lines=5, value=example, elem_id=f"input-text-en-{name_en.replace(' ', '')}")
                                        lang = gr.Dropdown(label="Language", choices=["Chinese", "Japanese", "Mix(wrap the Chinese text with [ZH][ZH], wrap the Japanese text with [JA][JA])"],
                                                           type="index", value=language)
                                        with gr.Accordion(label="Advanced Options", open=False):
                                            symbol_input = gr.Checkbox(value=False, label="Symbol input")
                                            symbol_list = gr.Dataset(label="Symbol list", components=[input_text],
                                                                     samples=[[x] for x in hps_ms.symbols])
                                            symbol_list_json = gr.Json(value=hps_ms.symbols, visible=False)
                                        btn = gr.Button(value="Generate", variant="primary")
                                        with gr.Row():
                                            ns = gr.Slider(label="noise_scale", minimum=0.1, maximum=1.0, step=0.1, value=0.6, interactive=True)
                                            nsw = gr.Slider(label="noise_scale_w", minimum=0.1, maximum=1.0, step=0.1, value=0.668, interactive=True)
                                            ls = gr.Slider(label="length_scale", minimum=0.1, maximum=2.0, step=0.1, value=1.2 if language == "Chinese" else 1, interactive=True)
                                    with gr.Column():
                                        o1 = gr.Textbox(label="Output Message")
                                        o2 = gr.Audio(label="Output Audio", elem_id=f"tts-audio-en-{name_en.replace(' ', '')}")
                                        download = gr.Button("Download Audio")
                                btn.click(tts_fn, inputs=[input_text, lang, ns, nsw, ls, symbol_input], outputs=[o1, o2], api_name=f"tts-{name_en}")
                                download.click(None, [], [], _js=download_audio_js.format(audio_id=f"en-{name_en.replace(' ', '')}"))
                                lang.change(change_lang, inputs=[lang], outputs=[ns, nsw, ls])
                                symbol_input.change(
                                    to_symbol_fn,
                                    [symbol_input, input_text, lang],
                                    [input_text]
                                )
                                # Client-side insertion of the clicked symbol at the caret.
                                symbol_list.click(None, [symbol_list, symbol_list_json], [input_text],
                                                  _js=f"""
(i,symbols) => {{
    let root = document.querySelector("body > gradio-app");
    if (root.shadowRoot != null)
        root = root.shadowRoot;
    let text_input = root.querySelector("#input-text-en-{name_en.replace(' ', '')}").querySelector("textarea");
    let startPos = text_input.selectionStart;
    let endPos = text_input.selectionEnd;
    let oldTxt = text_input.value;
    let result = oldTxt.substring(0, startPos) + symbols[i] + oldTxt.substring(endPos);
    text_input.value = result;
    let x = window.scrollX, y = window.scrollY;
    text_input.focus();
    text_input.selectionStart = startPos + symbols[i].length;
    text_input.selectionEnd = startPos + symbols[i].length;
    text_input.blur();
    window.scrollTo(x, y);
    return text_input.value;
}}""")
                    with gr.TabItem("中文"):
                        for (sid, name_en, name_zh, title, cover, example, language, net_g_ms, tts_fn, to_symbol_fn) in models:
                            if title.split("-")[0] != category:
                                continue
                            with gr.TabItem(name_zh):
                                with gr.Row():
                                    # Same </div> fix as the EN tab above.
                                    gr.Markdown(
                                        '<div align="center">'
                                        f'<a><strong>{title}</strong></a>'
                                        + (f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else "")
                                        + '</div>'
                                    )
                                with gr.Row():
                                    with gr.Column():
                                        input_text = gr.Textbox(label="文本 (100字上限)" if limitation else "文本", lines=5, value=example, elem_id=f"input-text-zh-{name_zh}")
                                        lang = gr.Dropdown(label="语言", choices=["中文", "日语", "中日混合(中文用[ZH][ZH]包裹起来,日文用[JA][JA]包裹起来)"],
                                                           type="index", value="中文" if language == "Chinese" else "日语")
                                        with gr.Accordion(label="高级选项", open=False):
                                            symbol_input = gr.Checkbox(value=False, label="符号输入")
                                            symbol_list = gr.Dataset(label="符号列表", components=[input_text],
                                                                     samples=[[x] for x in hps_ms.symbols])
                                            symbol_list_json = gr.Json(value=hps_ms.symbols, visible=False)
                                        btn = gr.Button(value="生成", variant="primary")
                                        with gr.Row():
                                            ns = gr.Slider(label="控制感情变化程度", minimum=0.1, maximum=1.0, step=0.1, value=0.6, interactive=True)
                                            nsw = gr.Slider(label="控制音素发音长度", minimum=0.1, maximum=1.0, step=0.1, value=0.668, interactive=True)
                                            ls = gr.Slider(label="控制整体语速", minimum=0.1, maximum=2.0, step=0.1, value=1.2 if language == "Chinese" else 1, interactive=True)
                                    with gr.Column():
                                        o1 = gr.Textbox(label="输出信息")
                                        o2 = gr.Audio(label="输出音频", elem_id=f"tts-audio-zh-{name_zh}")
                                        download = gr.Button("下载音频")
                                btn.click(tts_fn, inputs=[input_text, lang, ns, nsw, ls, symbol_input], outputs=[o1, o2])
                                download.click(None, [], [], _js=download_audio_js.format(audio_id=f"zh-{name_zh}"))
                                lang.change(change_lang, inputs=[lang], outputs=[ns, nsw, ls])
                                symbol_input.change(
                                    to_symbol_fn,
                                    [symbol_input, input_text, lang],
                                    [input_text]
                                )
                                symbol_list.click(None, [symbol_list, symbol_list_json], [input_text],
                                                  _js=f"""
(i,symbols) => {{
    let root = document.querySelector("body > gradio-app");
    if (root.shadowRoot != null)
        root = root.shadowRoot;
    let text_input = root.querySelector("#input-text-zh-{name_zh}").querySelector("textarea");
    let startPos = text_input.selectionStart;
    let endPos = text_input.selectionEnd;
    let oldTxt = text_input.value;
    let result = oldTxt.substring(0, startPos) + symbols[i] + oldTxt.substring(endPos);
    text_input.value = result;
    let x = window.scrollX, y = window.scrollY;
    text_input.focus();
    text_input.selectionStart = startPos + symbols[i].length;
    text_input.selectionEnd = startPos + symbols[i].length;
    text_input.blur();
    window.scrollTo(x, y);
    return text_input.value;
}}""")
            # Categories served by other spaces: link out instead of loading.
            for category, link in others.items():
                with gr.TabItem(category):
                    gr.Markdown(
                        f'''
                        <center>
                          <h2>Click to Go</h2>
                          <a href="{link}">
                          <img src="https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-xl-dark.svg"
                          </a>
                        </center>
                        '''
                    )

    app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
VITS-MODELS/attentions.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ import commons
7
+ from modules import LayerNorm
8
+
9
+
10
class Encoder(nn.Module):
    """Transformer encoder stack with windowed relative-position self-attention.

    Each layer applies masked self-attention and a convolutional FFN, both in
    post-norm residual form (dropout on the sub-layer output before the add).
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(n_layers):
            self.attn_layers.append(
                MultiHeadAttention(hidden_channels, hidden_channels, n_heads,
                                   p_dropout=p_dropout, window_size=window_size))
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(
                FFN(hidden_channels, hidden_channels, filter_channels,
                    kernel_size, p_dropout=p_dropout))
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        # Pairwise mask: position (i, j) is valid iff both i and j are valid.
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for attn, norm_1, ffn, norm_2 in zip(self.attn_layers, self.norm_layers_1,
                                             self.ffn_layers, self.norm_layers_2):
            y = self.drop(attn(x, x, attn_mask))
            x = norm_1(x + y)
            y = self.drop(ffn(x, x_mask))
            x = norm_2(x + y)
        return x * x_mask
45
+
46
+
47
class Decoder(nn.Module):
    """Transformer decoder stack: causal self-attention, encoder-decoder
    attention, and a causal convolutional FFN per layer (post-norm residual).
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(n_layers):
            self.self_attn_layers.append(
                MultiHeadAttention(hidden_channels, hidden_channels, n_heads,
                                   p_dropout=p_dropout, proximal_bias=proximal_bias,
                                   proximal_init=proximal_init))
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            self.encdec_attn_layers.append(
                MultiHeadAttention(hidden_channels, hidden_channels, n_heads,
                                   p_dropout=p_dropout))
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(
                FFN(hidden_channels, hidden_channels, filter_channels,
                    kernel_size, p_dropout=p_dropout, causal=True))
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        h: encoder output
        """
        # Lower-triangular mask enforces causality in self-attention.
        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
        # Cross-attention mask combines decoder and encoder validity.
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layer_groups = zip(self.self_attn_layers, self.norm_layers_0,
                           self.encdec_attn_layers, self.norm_layers_1,
                           self.ffn_layers, self.norm_layers_2)
        for self_attn, norm_0, encdec_attn, norm_1, ffn, norm_2 in layer_groups:
            x = norm_0(x + self.drop(self_attn(x, x, self_attn_mask)))
            x = norm_1(x + self.drop(encdec_attn(x, h, encdec_attn_mask)))
            x = norm_2(x + self.drop(ffn(x, x_mask)))
        return x * x_mask
96
+
97
+
98
class MultiHeadAttention(nn.Module):
    """Multi-head attention with optional extras used by VITS/Glow-TTS:

    - windowed relative-position embeddings (``window_size``), self-attn only;
    - proximal bias favouring nearby positions (``proximal_bias``);
    - block-local attention masking (``block_length``);
    - ``proximal_init`` copies the query projection into the key projection.

    Inputs/outputs are channel-first: [batch, channels, time].
    """
    def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
        super().__init__()
        # Channels must split evenly across heads.
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        # Last attention map is stashed here by forward() (e.g. for inspection).
        self.attn = None

        self.k_channels = channels // n_heads  # per-head dimension
        # 1x1 convs act as per-timestep linear projections for Q/K/V/output.
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # Learnable relative-position embeddings covering offsets
            # [-window_size, +window_size]; optionally shared across heads.
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
            self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            # Start with identical query/key projections so initial attention
            # is roughly diagonal.
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from x (queries) to c (keys/values); both [b, channels, t]."""
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        # Scaled dot-product logits: [b, n_h, t_t, t_s].
        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
        if self.window_size is not None:
            assert t_s == t_t, "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
            # Convert per-offset logits to per-absolute-position logits.
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
        if mask is not None:
            # -1e4 (not -inf) keeps fp16 finite while zeroing masked softmax mass.
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert t_s == t_t, "Local attention is only available for self-attention."
                # Band mask restricting attention to +-block_length positions.
                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            # Add the relative-position contribution to the values as well.
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Slice (and pad if needed) the relative embeddings to 2*length-1 offsets."""
        max_relative_position = 2 * self.window_size + 1
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # Pad along column.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        # Add 0's in the beginning that will skew the elements after reshape.
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        # -log1p(|i-j|): 0 on the diagonal, increasingly negative away from it.
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
252
+
253
+
254
class FFN(nn.Module):
    """Two-layer 1-D convolutional feed-forward block with masked inputs.

    Padding is chosen at construction time: causal (left-only) or "same"
    (symmetric), so the time dimension is preserved either way.
    """

    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Bind the padding strategy once instead of branching per forward call.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        h = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            # GELU approximated as x * sigmoid(1.702 * x).
            h = h * torch.sigmoid(1.702 * h)
        else:
            h = torch.relu(h)
        h = self.drop(h)
        h = self.conv_2(self.padding(h * x_mask))
        return h * x_mask

    def _causal_padding(self, x):
        # Left-pad only, so outputs never see future timesteps.
        if self.kernel_size == 1:
            return x
        pad_spec = [[0, 0], [0, 0], [self.kernel_size - 1, 0]]
        return F.pad(x, commons.convert_pad_shape(pad_spec))

    def _same_padding(self, x):
        # Symmetric padding preserving the sequence length.
        if self.kernel_size == 1:
            return x
        pad_spec = [[0, 0], [0, 0], [(self.kernel_size - 1) // 2, self.kernel_size // 2]]
        return F.pad(x, commons.convert_pad_shape(pad_spec))
VITS-MODELS/commons.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch.nn import functional as F
4
+ import torch.jit
5
+
6
+
7
def script_method(fn, _rcb=None):
    """No-op stand-in for ``torch.jit.script_method``: returns fn unchanged."""
    return fn


def script(obj, optimize=True, _frames_up=0, _rcb=None):
    """No-op stand-in for ``torch.jit.script``: returns obj unchanged."""
    return obj


# HACK: globally disable TorchScript compilation for this process by patching
# torch.jit's entry points with the identity stubs above.  Functions decorated
# with @torch.jit.script later in this module then run as plain Python.
torch.jit.script_method = script_method
torch.jit.script = script
17
+
18
+
19
def init_weights(m, mean=0.0, std=0.01):
    """Initialize convolution weights from N(mean, std).

    Intended for ``Module.apply``; only modules whose class name contains
    "Conv" are modified, everything else is left untouched.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
23
+
24
+
25
def get_padding(kernel_size, dilation=1):
    """Return the padding that keeps a convolution's output length equal
    to its input length for the given kernel size and dilation."""
    return (kernel_size * dilation - dilation) // 2
27
+
28
+
29
def convert_pad_shape(pad_shape):
    """Flatten a per-dimension ``[[left, right], ...]`` pad spec into the
    flat, last-dimension-first list that ``torch.nn.functional.pad`` expects."""
    flat = []
    for pair in reversed(pad_shape):
        flat.extend(pair)
    return flat
33
+
34
+
35
def intersperse(lst, item):
    """Return a new list with ``item`` placed between and around all
    elements: [a, b] -> [item, a, item, b, item]."""
    out = [item]
    for element in lst:
        out.append(element)
        out.append(item)
    return out
39
+
40
+
41
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """Element-wise KL(P || Q) between diagonal Gaussians given means and
    log standard deviations."""
    inv_var_q = torch.exp(-2. * logs_q)
    var_ratio = torch.exp(2. * logs_p) * inv_var_q
    mean_term = (m_p - m_q) ** 2 * inv_var_q
    return (logs_q - logs_p) - 0.5 + 0.5 * (var_ratio + mean_term)
46
+
47
+
48
def rand_gumbel(shape):
    """Draw Gumbel(0, 1) samples; the uniforms are squeezed away from the
    endpoints {0, 1} to avoid log(0) overflow."""
    u = torch.rand(shape) * 0.99998 + 0.00001
    return -torch.log(-torch.log(u))
52
+
53
+
54
def rand_gumbel_like(x):
    """Return Gumbel(0, 1) samples with the same shape, dtype and device as ``x``."""
    return rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
57
+
58
+
59
def slice_segments(x, ids_str, segment_size=4):
    """Gather one fixed-size time window per batch element.

    x: [b, d, t]; ids_str: per-batch start indices.
    Returns [b, d, segment_size].
    """
    ret = torch.zeros_like(x[:, :, :segment_size])
    for i, start in enumerate(ids_str):
        ret[i] = x[i, :, start:start + segment_size]
    return ret
66
+
67
+
68
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Pick a random valid start per batch element and slice out a segment.

    x: [b, d, t]; x_lengths: per-batch valid lengths (defaults to t).
    Returns (segments [b, d, segment_size], start indices [b]).
    """
    b, _, t = x.size()
    if x_lengths is None:
        x_lengths = t
    # Largest start index (exclusive) that still leaves a full segment.
    max_start = x_lengths - segment_size + 1
    ids_str = (torch.rand([b]).to(device=x.device) * max_start).to(dtype=torch.long)
    return slice_segments(x, ids_str, segment_size), ids_str
76
+
77
+
78
def get_timing_signal_1d(
    length, channels, min_timescale=1.0, max_timescale=1.0e4):
  """Build transformer-style sinusoidal positional encodings.

  Returns a [1, channels, length] tensor: the first channels//2 rows are
  sines and the next channels//2 are cosines over geometrically spaced
  timescales from min_timescale to max_timescale.  When ``channels`` is
  odd, one zero row is appended by the F.pad call below.
  """
  position = torch.arange(length, dtype=torch.float)
  num_timescales = channels // 2
  # Geometric progression of inverse timescales (assumes num_timescales > 1).
  log_timescale_increment = (
      math.log(float(max_timescale) / float(min_timescale)) /
      (num_timescales - 1))
  inv_timescales = min_timescale * torch.exp(
      torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
  scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
  signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
  # Zero-pad one extra channel row when channels is odd.
  signal = F.pad(signal, [0, 0, 0, channels % 2])
  signal = signal.view(1, channels, length)
  return signal
92
+
93
+
94
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Add sinusoidal positional encodings to ``x`` ([b, channels, length])."""
    _, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return x + signal.to(dtype=x.dtype, device=x.device)
98
+
99
+
100
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Concatenate sinusoidal positional encodings onto ``x`` along ``axis``."""
    _, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
104
+
105
+
106
def subsequent_mask(length):
    """Lower-triangular causal attention mask of shape [1, 1, length, length]."""
    return torch.tril(torch.ones(length, length))[None, None]
109
+
110
+
111
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
  """WaveNet-style gated activation: tanh/sigmoid gate over the summed inputs.

  n_channels: a 1-element int tensor giving the channel split point; the
  first n_channels channels feed tanh, the remainder feed sigmoid.
  NOTE: torch.jit.script is monkey-patched to a no-op earlier in this
  module, so this normally runs as plain Python.
  """
  n_channels_int = n_channels[0]
  in_act = input_a + input_b
  t_act = torch.tanh(in_act[:, :n_channels_int, :])
  s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
  acts = t_act * s_act
  return acts
119
+
120
+
121
def convert_pad_shape(pad_shape):
  """Flatten [[left, right], ...] pad pairs into F.pad's flat format.

  NOTE(review): exact duplicate of the identically-named definition earlier
  in this module; this later definition is the one that stays bound, and
  the duplicate should eventually be removed.
  """
  l = pad_shape[::-1]
  pad_shape = [item for sublist in l for item in sublist]
  return pad_shape
125
+
126
+
127
def shift_1d(x):
    """Shift ``x`` right by one step along the last dimension, filling the
    first slot with zero and dropping the last value."""
    # F.pad flat spec [1, 0, 0, 0, 0, 0] pads one zero on the left of the
    # time axis only (same as convert_pad_shape([[0,0],[0,0],[1,0]])).
    return F.pad(x, [1, 0, 0, 0, 0, 0])[:, :, :-1]
130
+
131
+
132
def sequence_mask(length, max_length=None):
    """Boolean mask [b, max_length] with True where position < length[b]."""
    if max_length is None:
        max_length = length.max()
    positions = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return positions.unsqueeze(0) < length.unsqueeze(1)
137
+
138
+
139
def generate_path(duration, mask):
    """Expand integer durations into a monotonic alignment path.

    duration: [b, 1, t_x] frames per text position.
    mask: [b, 1, t_y, t_x] valid-region mask.
    Returns a 0/1 path tensor of the same shape as ``mask``.
    """
    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    # Cumulative masks up to each running total, then difference along t_x so
    # every text position covers exactly its own span of output frames.
    flat = cum_duration.view(b * t_x)
    path = sequence_mask(flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    # [0, 0, 1, 0, 0, 0] pads one zero row before t_x (F.pad flat spec).
    path = path - F.pad(path, [0, 0, 1, 0, 0, 0])[:, :-1]
    path = path.unsqueeze(1).transpose(2, 3) * mask
    return path
155
+
156
+
157
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients in place to [-clip_value, clip_value] and return the
    total gradient norm (each tensor's norm is taken before it is clamped).

    ``clip_value`` may be None to only compute the norm.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    grads = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    total = 0
    for p in grads:
        total += p.grad.data.norm(norm_type).item() ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    return total ** (1. / norm_type)
VITS-MODELS/config/config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 2e-4,
8
+ "betas": [0.8, 0.99],
9
+ "eps": 1e-9,
10
+ "batch_size": 32,
11
+ "fp16_run": true,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8192,
14
+ "init_lr_ratio": 1,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0
18
+ },
19
+ "data": {
20
+ "training_files":"filelists/xiaoke_train.txt.cleaned",
21
+ "validation_files":"filelists/xiaoke_val.txt.cleaned",
22
+ "text_cleaners":["zh_ja_mixture_cleaners"],
23
+ "max_wav_value": 32768.0,
24
+ "sampling_rate": 22050,
25
+ "filter_length": 1024,
26
+ "hop_length": 256,
27
+ "win_length": 1024,
28
+ "n_mel_channels": 80,
29
+ "mel_fmin": 0.0,
30
+ "mel_fmax": null,
31
+ "add_blank": true,
32
+ "n_speakers": 804,
33
+ "cleaned_text": true
34
+ },
35
+ "model": {
36
+ "inter_channels": 192,
37
+ "hidden_channels": 192,
38
+ "filter_channels": 768,
39
+ "n_heads": 2,
40
+ "n_layers": 6,
41
+ "kernel_size": 3,
42
+ "p_dropout": 0.1,
43
+ "resblock": "1",
44
+ "resblock_kernel_sizes": [3,7,11],
45
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
46
+ "upsample_rates": [8,8,2,2],
47
+ "upsample_initial_channel": 512,
48
+ "upsample_kernel_sizes": [16,16,4,4],
49
+ "n_layers_q": 3,
50
+ "use_spectral_norm": false,
51
+ "gin_channels": 256
52
+ },
53
+ "speakers": ["\u7279\u522b\u5468", "\u65e0\u58f0\u94c3\u9e7f", "\u4e1c\u6d77\u5e1d\u7687\uff08\u5e1d\u5b9d\uff0c\u5e1d\u738b\uff09", "\u4e38\u5584\u65af\u57fa", "\u5bcc\u58eb\u5947\u8ff9", "\u5c0f\u6817\u5e3d", "\u9ec4\u91d1\u8239", "\u4f0f\u7279\u52a0", "\u5927\u548c\u8d64\u9aa5", "\u5927\u6811\u5feb\u8f66", "\u8349\u4e0a\u98de", "\u83f1\u4e9a\u9a6c\u900a", "\u76ee\u767d\u9ea6\u6606", "\u795e\u9e70", "\u597d\u6b4c\u5267", "\u6210\u7530\u767d\u4ec1", "\u9c81\u9053\u592b\u8c61\u5f81\uff08\u7687\u5e1d\uff09", "\u6c14\u69fd", "\u7231\u4e3d\u6570\u7801", "\u661f\u4e91\u5929\u7a7a", "\u7389\u85fb\u5341\u5b57", "\u7f8e\u5999\u59ff\u52bf", "\u7435\u7436\u6668\u5149", "\u6469\u8036\u91cd\u70ae", "\u66fc\u57ce\u8336\u5ea7", "\u7f8e\u6d66\u6ce2\u65c1", "\u76ee\u767d\u8d56\u6069", "\u83f1\u66d9", "\u96ea\u4e2d\u7f8e\u4eba", "\u7c73\u6d74", "\u827e\u5c3c\u65af\u98ce\u795e", "\u7231\u4e3d\u901f\u5b50\uff08\u7231\u4e3d\u5feb\u5b50\uff09", "\u7231\u6155\u7ec7\u59ec", "\u7a3b\u8377\u4e00", "\u80dc\u5229\u5956\u5238", "\u7a7a\u4e2d\u795e\u5bab", "\u8363\u8fdb\u95ea\u8000", "\u771f\u673a\u4f36", "\u5ddd\u4e0a\u516c\u4e3b", "\u9ec4\u91d1\u57ce\uff08\u9ec4\u91d1\u57ce\u5e02\uff09", "\u6a31\u82b1\u8fdb\u738b", "\u91c7\u73e0", "\u65b0\u5149\u98ce", "\u4e1c\u5546\u53d8\u9769", "\u8d85\u7ea7\u5c0f\u6d77\u6e7e", "\u9192\u76ee\u98de\u9e70\uff08\u5bc4\u5bc4\u5b50\uff09", "\u8352\u6f20\u82f1\u96c4", "\u4e1c\u701b\u4f50\u6566", "\u4e2d\u5c71\u5e86\u5178", "\u6210\u7530\u5927\u8fdb", "\u897f\u91ce\u82b1", "\u6625\u4e3d\uff08\u4e4c\u62c9\u62c9\uff09", "\u9752\u7af9\u56de\u5fc6", "\u5fae\u5149\u98de\u9a79", "\u7f8e\u4e3d\u5468\u65e5", "\u5f85\u517c\u798f\u6765", "mr cb\uff08cb\u5148\u751f\uff09", "\u540d\u5c06\u6012\u6d9b\uff08\u540d\u5c06\u6237\u4ec1\uff09", "\u76ee\u767d\u591a\u4f2f", "\u4f18\u79c0\u7d20\u8d28", "\u5e1d\u738b\u5149\u8f89", "\u5f85\u517c\u8bd7\u6b4c\u5267", "\u751f\u91ce\u72c4\u675c\u65af", "\u76ee\u767d\u5584\u4fe1", "\u5927\u62d3\u592a\u9633\u795e", 
"\u53cc\u6da1\u8f6e\uff08\u4e24\u7acb\u76f4\uff0c\u4e24\u55b7\u5c04\uff0c\u4e8c\u9505\u5934\uff0c\u9006\u55b7\u5c04\uff09", "\u91cc\u89c1\u5149\u94bb\uff08\u8428\u6258\u8bfa\u91d1\u521a\u77f3\uff09", "\u5317\u90e8\u7384\u9a79", "\u6a31\u82b1\u5343\u4ee3\u738b", "\u5929\u72fc\u661f\u8c61\u5f81", "\u76ee\u767d\u963f\u5c14\u4e39", "\u516b\u91cd\u65e0\u654c", "\u9e64\u4e38\u521a\u5fd7", "\u76ee\u767d\u5149\u660e", "\u6210\u7530\u62dc\u4ec1\uff08\u6210\u7530\u8def\uff09", "\u4e5f\u6587\u6444\u8f89", "\u5c0f\u6797\u5386\u5947", "\u5317\u6e2f\u706b\u5c71", "\u5947\u9510\u9a8f", "\u82e6\u6da9\u7cd6\u971c", "\u5c0f\u5c0f\u8695\u8327", "\u9a8f\u5ddd\u624b\u7eb2\uff08\u7eff\u5e3d\u6076\u9b54\uff09", "\u79cb\u5ddd\u5f25\u751f\uff08\u5c0f\u5c0f\u7406\u4e8b\u957f\uff09", "\u4e59\u540d\u53f2\u60a6\u5b50\uff08\u4e59\u540d\u8bb0\u8005\uff09", "\u6850\u751f\u9662\u8475", "\u5b89\u5fc3\u6cfd\u523a\u523a\u7f8e", "\u6a2b\u672c\u7406\u5b50", "\u795e\u91cc\u7eeb\u534e\uff08\u9f9f\u9f9f\uff09", "\u7434", "\u7a7a\uff08\u7a7a\u54e5\uff09", "\u4e3d\u838e", "\u8367\uff08\u8367\u59b9\uff09", "\u82ad\u82ad\u62c9", "\u51ef\u4e9a", "\u8fea\u5362\u514b", "\u96f7\u6cfd", "\u5b89\u67cf", "\u6e29\u8fea", "\u9999\u83f1", "\u5317\u6597", "\u884c\u79cb", "\u9b48", "\u51dd\u5149", "\u53ef\u8389", "\u949f\u79bb", "\u83f2\u8c22\u5c14\uff08\u7687\u5973\uff09", "\u73ed\u5c3c\u7279", "\u8fbe\u8fbe\u5229\u4e9a\uff08\u516c\u5b50\uff09", "\u8bfa\u827e\u5c14\uff08\u5973\u4ec6\uff09", "\u4e03\u4e03", "\u91cd\u4e91", "\u7518\u96e8\uff08\u6930\u7f8a\uff09", "\u963f\u8d1d\u591a", "\u8fea\u5965\u5a1c\uff08\u732b\u732b\uff09", "\u83ab\u5a1c", "\u5c0f\u53ef", "\u7802\u7cd6", "\u8f9b\u7131", "\u7f57\u838e\u8389\u4e9a", "\u80e1\u6843", "\u67ab\u539f\u4e07\u53f6\uff08\u4e07\u53f6\uff09", "\u70df\u7eef", "\u5bb5\u5bab", "\u6258\u9a6c", "\u4f18\u83c8", "\u96f7\u7535\u5c06\u519b\uff08\u96f7\u795e\uff09", "\u65e9\u67da", "\u73ca\u745a\u5bab\u5fc3\u6d77\uff08\u5fc3\u6d77\uff0c\u6263\u6263\u7c73\uff09", "\u4e94\u90ce", 
"\u4e5d\u6761\u88df\u7f57", "\u8352\u6cf7\u4e00\u6597\uff08\u4e00\u6597\uff09", "\u57c3\u6d1b\u4f0a", "\u7533\u9e64", "\u516b\u91cd\u795e\u5b50\uff08\u795e\u5b50\uff09", "\u795e\u91cc\u7eeb\u4eba\uff08\u7eeb\u4eba\uff09", "\u591c\u5170", "\u4e45\u5c90\u5fcd", "\u9e7f\u91ce\u82d1\u5e73\u85cf", "\u63d0\u7eb3\u91cc", "\u67ef\u83b1", "\u591a\u8389", "\u4e91\u5807", "\u7eb3\u897f\u59b2\uff08\u8349\u795e\uff09", "\u6df1\u6e0a\u4f7f\u5f92", "\u59ae\u9732", "\u8d5b\u8bfa", "\u503a\u52a1\u5904\u7406\u4eba", "\u574e\u8482\u4e1d", "\u771f\u5f13\u5feb\u8f66", "\u79cb\u4eba", "\u671b\u65cf", "\u827e\u5c14\u83f2", "\u827e\u8389\u4e1d", "\u827e\u4f26", "\u963f\u6d1b\u74e6", "\u5929\u91ce", "\u5929\u76ee\u5341\u4e94", "\u611a\u4eba\u4f17-\u5b89\u5fb7\u70c8", "\u5b89\u987a", "\u5b89\u897f", "\u8475", "\u9752\u6728", "\u8352\u5ddd\u5e78\u6b21", "\u8352\u8c37", "\u6709\u6cfd", "\u6d45\u5ddd", "\u9ebb\u7f8e", "\u51dd\u5149\u52a9\u624b", "\u963f\u6258", "\u7afa\u5b50", "\u767e\u8bc6", "\u767e\u95fb", "\u767e\u6653", "\u767d\u672f", "\u8d1d\u96c5\u7279\u4e3d\u5947", "\u4e3d\u5854", "\u5931\u843d\u8ff7\u8fed", "\u7f2d\u4e71\u661f\u68d8", "\u4f0a\u7538", "\u4f0f\u7279\u52a0\u5973\u5b69", "\u72c2\u70ed\u84dd\u8c03", "\u8389\u8389\u5a05", "\u841d\u838e\u8389\u5a05", "\u516b\u91cd\u6a31", "\u516b\u91cd\u971e", "\u5361\u83b2", "\u7b2c\u516d\u591c\u60f3\u66f2", "\u5361\u841d\u5c14", "\u59ec\u5b50", "\u6781\u5730\u6218\u5203", "\u5e03\u6d1b\u59ae\u5a05", "\u6b21\u751f\u94f6\u7ffc", "\u7406\u4e4b\u5f8b\u8005%26\u5e0c\u513f", "\u7406\u4e4b\u5f8b\u8005", "\u8ff7\u57ce\u9a87\u5154", "\u5e0c\u513f", "\u9b47\u591c\u661f\u6e0a", "\u9ed1\u5e0c\u513f", "\u5e15\u6735\u83f2\u8389\u4e1d", "\u4e0d\u706d\u661f\u951a", "\u5929\u5143\u9a91\u82f1", "\u5e7d\u5170\u9edb\u5c14", "\u6d3e\u8499bh3", "\u7231\u9171", "\u7eef\u7389\u4e38", "\u5fb7\u4e3d\u838e", "\u6708\u4e0b\u521d\u62e5", "\u6714\u591c\u89c2\u661f", "\u66ae\u5149\u9a91\u58eb", "\u683c\u857e\u4fee", "\u7559\u4e91\u501f\u98ce\u771f\u541b", 
"\u6885\u6bd4\u4e4c\u65af", "\u4eff\u72b9\u5927", "\u514b\u83b1\u56e0", "\u5723\u5251\u5e7d\u5170\u9edb\u5c14", "\u5996\u7cbe\u7231\u8389", "\u7279\u65af\u62c9zero", "\u82cd\u7384", "\u82e5\u6c34", "\u897f\u7433", "\u6234\u56e0\u65af\u96f7\u5e03", "\u8d1d\u62c9", "\u8d64\u9e22", "\u9547\u9b42\u6b4c", "\u6e21\u9e26", "\u4eba\u4e4b\u5f8b\u8005", "\u7231\u8389\u5e0c\u96c5", "\u5929\u7a79\u6e38\u4fa0", "\u742a\u4e9a\u5a1c", "\u7a7a\u4e4b\u5f8b\u8005", "\u85aa\u708e\u4e4b\u5f8b\u8005", "\u4e91\u58a8\u4e39\u5fc3", "\u7b26\u534e", "\u8bc6\u4e4b\u5f8b\u8005", "\u7279\u74e6\u6797", "\u7ef4\u5c14\u8587", "\u82bd\u8863", "\u96f7\u4e4b\u5f8b\u8005", "\u65ad\u7f6a\u5f71\u821e", "\u963f\u6ce2\u5c3c\u4e9a", "\u698e\u672c", "\u5384\u5c3c\u65af\u7279", "\u6076\u9f99", "\u8303\u4e8c\u7237", "\u6cd5\u62c9", "\u611a\u4eba\u4f17\u58eb\u5175", "\u611a\u4eba\u4f17\u58eb\u5175a", "\u611a\u4eba\u4f17\u58eb\u5175b", "\u611a\u4eba\u4f17\u58eb\u5175c", "\u611a\u4eba\u4f17a", "\u611a\u4eba\u4f17b", "\u98de\u98de", "\u83f2\u5229\u514b\u65af", "\u5973\u6027\u8ddf\u968f\u8005", "\u9022\u5ca9", "\u6446\u6e21\u4eba", "\u72c2\u8e81\u7684\u7537\u4eba", "\u5965\u5179", "\u8299\u841d\u62c9", "\u8ddf\u968f\u8005", "\u871c\u6c41\u751f\u7269", "\u9ec4\u9ebb\u5b50", "\u6e0a\u4e0a", "\u85e4\u6728", "\u6df1\u89c1", "\u798f\u672c", "\u8299\u84c9", "\u53e4\u6cfd", "\u53e4\u7530", "\u53e4\u5c71", "\u53e4\u8c37\u6607", "\u5085\u4e09\u513f", "\u9ad8\u8001\u516d", "\u77ff\u5de5\u5192", "\u5143\u592a", "\u5fb7\u5b89\u516c", "\u8302\u624d\u516c", "\u6770\u62c9\u5fb7", "\u845b\u7f57\u4e3d", "\u91d1\u5ffd\u5f8b", "\u516c\u4fca", "\u9505\u5df4", "\u6b4c\u5fb7", "\u963f\u8c6a", "\u72d7\u4e09\u513f", "\u845b\u745e\u4e1d", "\u82e5\u5fc3", "\u963f\u5c71\u5a46", "\u602a\u9e1f", "\u5e7f\u7af9", "\u89c2\u6d77", "\u5173\u5b8f", "\u871c\u6c41\u536b\u5175", "\u5b88\u536b1", "\u50b2\u6162\u7684\u5b88\u536b", "\u5bb3\u6015\u7684\u5b88\u536b", "\u8d35\u5b89", "\u76d6\u4f0a", "\u963f\u521b", "\u54c8\u592b\u4e39", 
"\u65e5\u8bed\u963f\u8d1d\u591a\uff08\u91ce\u5c9b\u5065\u513f\uff09", "\u65e5\u8bed\u57c3\u6d1b\u4f0a\uff08\u9ad8\u57a3\u5f69\u9633\uff09", "\u65e5\u8bed\u5b89\u67cf\uff08\u77f3\u89c1\u821e\u83dc\u9999\uff09", "\u65e5\u8bed\u795e\u91cc\u7eeb\u534e\uff08\u65e9\u89c1\u6c99\u7ec7\uff09", "\u65e5\u8bed\u795e\u91cc\u7eeb\u4eba\uff08\u77f3\u7530\u5f70\uff09", "\u65e5\u8bed\u767d\u672f\uff08\u6e38\u4f50\u6d69\u4e8c\uff09", "\u65e5\u8bed\u82ad\u82ad\u62c9\uff08\u9b3c\u5934\u660e\u91cc\uff09", "\u65e5\u8bed\u5317\u6597\uff08\u5c0f\u6e05\u6c34\u4e9a\u7f8e\uff09", "\u65e5\u8bed\u73ed\u5c3c\u7279\uff08\u9022\u5742\u826f\u592a\uff09", "\u65e5\u8bed\u574e\u8482\u4e1d\uff08\u67da\u6728\u51c9\u9999\uff09", "\u65e5\u8bed\u91cd\u4e91\uff08\u9f50\u85e4\u58ee\u9a6c\uff09", "\u65e5\u8bed\u67ef\u83b1\uff08\u524d\u5ddd\u51c9\u5b50\uff09", "\u65e5\u8bed\u8d5b\u8bfa\uff08\u5165\u91ce\u81ea\u7531\uff09", "\u65e5\u8bed\u6234\u56e0\u65af\u96f7\u5e03\uff08\u6d25\u7530\u5065\u6b21\u90ce\uff09", "\u65e5\u8bed\u8fea\u5362\u514b\uff08\u5c0f\u91ce\u8d24\u7ae0\uff09", "\u65e5\u8bed\u8fea\u5965\u5a1c\uff08\u4e95\u6cfd\u8bd7\u7ec7\uff09", "\u65e5\u8bed\u591a\u8389\uff08\u91d1\u7530\u670b\u5b50\uff09", "\u65e5\u8bed\u4f18\u83c8\uff08\u4f50\u85e4\u5229\u5948\uff09", "\u65e5\u8bed\u83f2\u8c22\u5c14\uff08\u5185\u7530\u771f\u793c\uff09", "\u65e5\u8bed\u7518\u96e8\uff08\u4e0a\u7530\u4e3d\u5948\uff09", "\u65e5\u8bed\uff08\u7560\u4e2d\u7950\uff09", "\u65e5\u8bed\u9e7f\u91ce\u9662\u5e73\u85cf\uff08\u4e95\u53e3\u7950\u4e00\uff09", "\u65e5\u8bed\u7a7a\uff08\u5800\u6c5f\u77ac\uff09", "\u65e5\u8bed\u8367\uff08\u60a0\u6728\u78a7\uff09", "\u65e5\u8bed\u80e1\u6843\uff08\u9ad8\u6865\u674e\u4f9d\uff09", "\u65e5\u8bed\u4e00\u6597\uff08\u897f\u5ddd\u8d35\u6559\uff09", "\u65e5\u8bed\u51ef\u4e9a\uff08\u9e1f\u6d77\u6d69\u8f85\uff09", "\u65e5\u8bed\u4e07\u53f6\uff08\u5c9b\u5d0e\u4fe1\u957f\uff09", "\u65e5\u8bed\u523b\u6674\uff08\u559c\u591a\u6751\u82f1\u68a8\uff09", 
"\u65e5\u8bed\u53ef\u8389\uff08\u4e45\u91ce\u7f8e\u54b2\uff09", "\u65e5\u8bed\u5fc3\u6d77\uff08\u4e09\u68ee\u94c3\u5b50\uff09", "\u65e5\u8bed\u4e5d\u6761\u88df\u7f57\uff08\u6fd1\u6237\u9ebb\u6c99\u7f8e\uff09", "\u65e5\u8bed\u4e3d\u838e\uff08\u7530\u4e2d\u7406\u60e0\uff09", "\u65e5\u8bed\u83ab\u5a1c\uff08\u5c0f\u539f\u597d\u7f8e\uff09", "\u65e5\u8bed\u7eb3\u897f\u59b2\uff08\u7530\u6751\u7531\u52a0\u8389\uff09", "\u65e5\u8bed\u59ae\u9732\uff08\u91d1\u5143\u5bff\u5b50\uff09", "\u65e5\u8bed\u51dd\u5149\uff08\u5927\u539f\u6c99\u8036\u9999\uff09", "\u65e5\u8bed\u8bfa\u827e\u5c14\uff08\u9ad8\u5c3e\u594f\u97f3\uff09", "\u65e5\u8bed\u5965\u5179\uff08\u589e\u8c37\u5eb7\u7eaa\uff09", "\u65e5\u8bed\u6d3e\u8499\uff08\u53e4\u8d3a\u8475\uff09", "\u65e5\u8bed\u7434\uff08\u658b\u85e4\u5343\u548c\uff09", "\u65e5\u8bed\u4e03\u4e03\uff08\u7530\u6751\u7531\u52a0\u8389\uff09", "\u65e5\u8bed\u96f7\u7535\u5c06\u519b\uff08\u6cfd\u57ce\u7f8e\u96ea\uff09", "\u65e5\u8bed\u96f7\u6cfd\uff08\u5185\u5c71\u6602\u8f89\uff09", "\u65e5\u8bed\u7f57\u838e\u8389\u4e9a\uff08\u52a0\u9688\u4e9a\u8863\uff09", "\u65e5\u8bed\u65e9\u67da\uff08\u6d32\u5d0e\u7eeb\uff09", "\u65e5\u8bed\u6563\u5175\uff08\u67ff\u539f\u5f7b\u4e5f\uff09", "\u65e5\u8bed\u7533\u9e64\uff08\u5ddd\u6f84\u7eeb\u5b50\uff09", "\u65e5\u8bed\u4e45\u5c90\u5fcd\uff08\u6c34\u6865\u9999\u7ec7\uff09", "\u65e5\u8bed\u5973\u58eb\uff08\u5e84\u5b50\u88d5\u8863\uff09", "\u65e5\u8bed\u7802\u7cd6\uff08\u85e4\u7530\u831c\uff09", "\u65e5\u8bed\u8fbe\u8fbe\u5229\u4e9a\uff08\u6728\u6751\u826f\u5e73\uff09", "\u65e5\u8bed\u6258\u9a6c\uff08\u68ee\u7530\u6210\u4e00\uff09", "\u65e5\u8bed\u63d0\u7eb3\u91cc\uff08\u5c0f\u6797\u6c99\u82d7\uff09", "\u65e5\u8bed\u6e29\u8fea\uff08\u6751\u6fd1\u6b65\uff09", "\u65e5\u8bed\u9999\u83f1\uff08\u5c0f\u6cfd\u4e9a\u674e\uff09", "\u65e5\u8bed\u9b48\uff08\u677e\u5188\u796f\u4e1e\uff09", "\u65e5\u8bed\u884c\u79cb\uff08\u7686\u5ddd\u7eaf\u5b50\uff09", "\u65e5\u8bed\u8f9b\u7131\uff08\u9ad8\u6865\u667a\u79cb\uff09", 
"\u65e5\u8bed\u516b\u91cd\u795e\u5b50\uff08\u4f50\u4ed3\u7eeb\u97f3\uff09", "\u65e5\u8bed\u70df\u7eef\uff08\u82b1\u5b88\u7531\u7f8e\u91cc\uff09", "\u65e5\u8bed\u591c\u5170\uff08\u8fdc\u85e4\u7eeb\uff09", "\u65e5\u8bed\u5bb5\u5bab\uff08\u690d\u7530\u4f73\u5948\uff09", "\u65e5\u8bed\u4e91\u5807\uff08\u5c0f\u5ca9\u4e95\u5c0f\u9e1f\uff09", "\u65e5\u8bed\u949f\u79bb\uff08\u524d\u91ce\u667a\u662d\uff09", "\u6770\u514b", "\u963f\u5409", "\u6c5f\u821f", "\u9274\u79cb", "\u5609\u4e49", "\u7eaa\u82b3", "\u666f\u6f84", "\u7ecf\u7eb6", "\u666f\u660e", "\u664b\u4f18", "\u963f\u9e20", "\u9152\u5ba2", "\u4e54\u5c14", "\u4e54\u745f\u592b", "\u7ea6\u987f", "\u4e54\u4f0a\u65af", "\u5c45\u5b89", "\u541b\u541b", "\u987a\u5409", "\u7eaf\u4e5f", "\u91cd\u4f50", "\u5927\u5c9b\u7eaf\u5e73", "\u84b2\u6cfd", "\u52d8\u89e3\u7531\u5c0f\u8def\u5065\u4e09\u90ce", "\u67ab", "\u67ab\u539f\u4e49\u5e86", "\u836b\u5c71", "\u7532\u6590\u7530\u9f8d\u99ac", "\u6d77\u6597", "\u60df\u795e\u6674\u4e4b\u4ecb", "\u9e7f\u91ce\u5948\u5948", "\u5361\u7435\u8389\u4e9a", "\u51ef\u745f\u7433", "\u52a0\u85e4\u4fe1\u609f", "\u52a0\u85e4\u6d0b\u5e73", "\u80dc\u5bb6", "\u8305\u847a\u4e00\u5e86", "\u548c\u662d", "\u4e00\u6b63", "\u4e00\u9053", "\u6842\u4e00", "\u5e86\u6b21\u90ce", "\u963f\u8d24", "\u5065\u53f8", "\u5065\u6b21\u90ce", "\u5065\u4e09\u90ce", "\u5929\u7406", "\u6740\u624ba", "\u6740\u624bb", "\u6728\u5357\u674f\u5948", "\u6728\u6751", "\u56fd\u738b", "\u6728\u4e0b", "\u5317\u6751", "\u6e05\u60e0", "\u6e05\u4eba", "\u514b\u5217\u95e8\u7279", "\u9a91\u58eb", "\u5c0f\u6797", "\u5c0f\u6625", "\u5eb7\u62c9\u5fb7", "\u5927\u8089\u4e38", "\u7434\u7f8e", "\u5b8f\u4e00", "\u5eb7\u4ecb", "\u5e78\u5fb7", "\u9ad8\u5584", "\u68a2", "\u514b\u7f57\u7d22", "\u4e45\u4fdd", "\u4e5d\u6761\u9570\u6cbb", "\u4e45\u6728\u7530", "\u6606\u94a7", "\u83ca\u5730\u541b", "\u4e45\u5229\u987b", "\u9ed1\u7530", "\u9ed1\u6cfd\u4eac\u4e4b\u4ecb", "\u54cd\u592a", "\u5c9a\u59d0", "\u5170\u6eaa", "\u6f9c\u9633", "\u52b3\u4f26\u65af", 
"\u4e50\u660e", "\u83b1\u8bfa", "\u83b2", "\u826f\u5b50", "\u674e\u5f53", "\u674e\u4e01", "\u5c0f\u4e50", "\u7075", "\u5c0f\u73b2", "\u7433\u7405a", "\u7433\u7405b", "\u5c0f\u5f6c", "\u5c0f\u5fb7", "\u5c0f\u697d", "\u5c0f\u9f99", "\u5c0f\u5434", "\u5c0f\u5434\u7684\u8bb0\u5fc6", "\u7406\u6b63", "\u963f\u9f99", "\u5362\u5361", "\u6d1b\u6210", "\u7f57\u5de7", "\u5317\u98ce\u72fc", "\u5362\u6b63", "\u840d\u59e5\u59e5", "\u524d\u7530", "\u771f\u663c", "\u9ebb\u7eaa", "\u771f", "\u611a\u4eba\u4f17-\u9a6c\u514b\u897f\u59c6", "\u5973\u6027a", "\u5973\u6027b", "\u5973\u6027a\u7684\u8ddf\u968f\u8005", "\u963f\u5b88", "\u739b\u683c\u4e3d\u7279", "\u771f\u7406", "\u739b\u4e54\u4e3d", "\u739b\u6587", "\u6b63\u80dc", "\u660c\u4fe1", "\u5c06\u53f8", "\u6b63\u4eba", "\u8def\u7237", "\u8001\u7ae0", "\u677e\u7530", "\u677e\u672c", "\u677e\u6d66", "\u677e\u5742", "\u8001\u5b5f", "\u5b5f\u4e39", "\u5546\u4eba\u968f\u4ece", "\u4f20\u4ee4\u5175", "\u7c73\u6b47\u5c14", "\u5fa1\u8206\u6e90\u4e00\u90ce", "\u5fa1\u8206\u6e90\u6b21\u90ce", "\u5343\u5ca9\u519b\u6559\u5934", "\u5343\u5ca9\u519b\u58eb\u5175", "\u660e\u535a", "\u660e\u4fca", "\u7f8e\u94c3", "\u7f8e\u548c", "\u963f\u5e78", "\u524a\u6708\u7b51\u9633\u771f\u541b", "\u94b1\u773c\u513f", "\u68ee\u5f66", "\u5143\u52a9", "\u7406\u6c34\u53e0\u5c71\u771f\u541b", "\u7406\u6c34\u758a\u5c71\u771f\u541b", "\u6731\u8001\u677f", "\u6728\u6728", "\u6751\u4e0a", "\u6751\u7530", "\u6c38\u91ce", "\u957f\u91ce\u539f\u9f99\u4e4b\u4ecb", "\u957f\u6fd1", "\u4e2d\u91ce\u5fd7\u4e43", "\u83dc\u83dc\u5b50", "\u6960\u6960", "\u6210\u6fd1", "\u963f\u5185", "\u5b81\u7984", "\u725b\u5fd7", "\u4fe1\u535a", "\u4f38\u592b", "\u91ce\u65b9", "\u8bfa\u62c9", "\u7eaa\u9999", "\u8bfa\u66fc", "\u4fee\u5973", "\u7eaf\u6c34\u7cbe\u7075", "\u5c0f\u5ddd", "\u5c0f\u4ed3\u6faa", "\u5188\u6797", "\u5188\u5d0e\u7ed8\u91cc\u9999", "\u5188\u5d0e\u9646\u6597", "\u5965\u62c9\u592b", "\u8001\u79d1", "\u9b3c\u5a46\u5a46", "\u5c0f\u91ce\u5bfa", 
"\u5927\u6cb3\u539f\u4e94\u53f3\u536b\u95e8", "\u5927\u4e45\u4fdd\u5927\u4ecb", "\u5927\u68ee", "\u5927\u52a9", "\u5965\u7279", "\u6d3e\u8499", "\u6d3e\u84992", "\u75c5\u4ebaa", "\u75c5\u4ebab", "\u5df4\u987f", "\u6d3e\u6069", "\u670b\u4e49", "\u56f4\u89c2\u7fa4\u4f17", "\u56f4\u89c2\u7fa4\u4f17a", "\u56f4\u89c2\u7fa4\u4f17b", "\u56f4\u89c2\u7fa4\u4f17c", "\u56f4\u89c2\u7fa4\u4f17d", "\u56f4\u89c2\u7fa4\u4f17e", "\u94dc\u96c0", "\u963f\u80a5", "\u5174\u53d4", "\u8001\u5468\u53d4", "\u516c\u4e3b", "\u5f7c\u5f97", "\u4e7e\u5b50", "\u828a\u828a", "\u4e7e\u73ae", "\u7eee\u547d", "\u675e\u5e73", "\u79cb\u6708", "\u6606\u6069", "\u96f7\u7535\u5f71", "\u5170\u9053\u5c14", "\u96f7\u8499\u5fb7", "\u5192\u5931\u7684\u5e15\u62c9\u5fb7", "\u4f36\u4e00", "\u73b2\u82b1", "\u963f\u4ec1", "\u5bb6\u81e3\u4eec", "\u68a8\u7ed8", "\u8363\u6c5f", "\u620e\u4e16", "\u6d6a\u4eba", "\u7f57\u4f0a\u65af", "\u5982\u610f", "\u51c9\u5b50", "\u5f69\u9999", "\u9152\u4e95", "\u5742\u672c", "\u6714\u6b21\u90ce", "\u6b66\u58eba", "\u6b66\u58ebb", "\u6b66\u58ebc", "\u6b66\u58ebd", "\u73ca\u745a", "\u4e09\u7530", "\u838e\u62c9", "\u7b39\u91ce", "\u806a\u7f8e", "\u806a", "\u5c0f\u767e\u5408", "\u6563\u5175", "\u5bb3\u6015\u7684\u5c0f\u5218", "\u8212\u4f2f\u7279", "\u8212\u8328", "\u6d77\u9f99", "\u4e16\u5b50", "\u8c22\u5c14\u76d6", "\u5bb6\u4e01", "\u5546\u534e", "\u6c99\u5bc5", "\u963f\u5347", "\u67f4\u7530", "\u963f\u8302", "\u5f0f\u5927\u5c06", "\u6e05\u6c34", "\u5fd7\u6751\u52d8\u5175\u536b", "\u65b0\u4e4b\u4e1e", "\u5fd7\u7ec7", "\u77f3\u5934", "\u8bd7\u7fbd", "\u8bd7\u7b60", "\u77f3\u58ee", "\u7fd4\u592a", "\u6b63\u4e8c", "\u5468\u5e73", "\u8212\u6768", "\u9f50\u683c\u8299\u4e3d\u96c5", "\u5973\u58eb", "\u601d\u52e4", "\u516d\u6307\u4e54\u745f", "\u611a\u4eba\u4f17\u5c0f\u5175d", "\u611a\u4eba\u4f17\u5c0f\u5175a", "\u611a\u4eba\u4f17\u5c0f\u5175b", "\u611a\u4eba\u4f17\u5c0f\u5175c", "\u5434\u8001\u4e94", "\u5434\u8001\u4e8c", "\u6ed1\u5934\u9b3c", "\u8a00\u7b11", "\u5434\u8001\u4e03", 
"\u58eb\u5175h", "\u58eb\u5175i", "\u58eb\u5175a", "\u58eb\u5175b", "\u58eb\u5175c", "\u58eb\u5175d", "\u58eb\u5175e", "\u58eb\u5175f", "\u58eb\u5175g", "\u594f\u592a", "\u65af\u5766\u5229", "\u6387\u661f\u652b\u8fb0\u5929\u541b", "\u5c0f\u5934", "\u5927\u6b66", "\u9676\u4e49\u9686", "\u6749\u672c", "\u82cf\u897f", "\u5acc\u7591\u4ebaa", "\u5acc\u7591\u4ebab", "\u5acc\u7591\u4ebac", "\u5acc\u7591\u4ebad", "\u65af\u4e07", "\u5251\u5ba2a", "\u5251\u5ba2b", "\u963f\u4e8c", "\u5fe0\u80dc", "\u5fe0\u592b", "\u963f\u656c", "\u5b5d\u5229", "\u9e70\u53f8\u8fdb", "\u9ad8\u5c71", "\u4e5d\u6761\u5b5d\u884c", "\u6bc5", "\u7af9\u5185", "\u62d3\u771f", "\u5353\u4e5f", "\u592a\u90ce\u4e38", "\u6cf0\u52d2", "\u624b\u5c9b", "\u54f2\u5e73", "\u54f2\u592b", "\u6258\u514b", "\u5927boss", "\u963f\u5f3a", "\u6258\u5c14\u5fb7\u62c9", "\u65c1\u89c2\u8005", "\u5929\u6210", "\u963f\u5927", "\u8482\u739b\u4e4c\u65af", "\u63d0\u7c73", "\u6237\u7530", "\u963f\u4e09", "\u4e00\u8d77\u7684\u4eba", "\u5fb7\u7530", "\u5fb7\u957f", "\u667a\u6811", "\u5229\u5f66", "\u80d6\u4e4e\u4e4e\u7684\u65c5\u884c\u8005", "\u85cf\u5b9d\u4ebaa", "\u85cf\u5b9d\u4ebab", "\u85cf\u5b9d\u4ebac", "\u85cf\u5b9d\u4ebad", "\u963f\u7947", "\u6052\u96c4", "\u9732\u5b50", "\u8bdd\u5267\u56e2\u56e2\u957f", "\u5185\u6751", "\u4e0a\u91ce", "\u4e0a\u6749", "\u8001\u6234", "\u8001\u9ad8", "\u8001\u8d3e", "\u8001\u58a8", "\u8001\u5b59", "\u5929\u67a2\u661f", "\u8001\u4e91", "\u6709\u4e50\u658b", "\u4e11\u96c4", "\u4e4c\u7ef4", "\u74e6\u4eac", "\u83f2\u5c14\u6208\u9edb\u7279", "\u7ef4\u591a\u5229\u4e9a", "\u8587\u5c14", "\u74e6\u683c\u7eb3", "\u963f\u5916", "\u4f8d\u5973", "\u74e6\u62c9", "\u671b\u96c5", "\u5b9b\u70df", "\u742c\u7389", "\u6218\u58eba", "\u6218\u58ebb", "\u6e21\u8fba", "\u6e21\u90e8", "\u963f\u4f1f", "\u6587\u749f", "\u6587\u6e0a", "\u97e6\u5c14\u7eb3", "\u738b\u6273\u624b", "\u6b66\u6c9b", "\u6653\u98de", "\u8f9b\u7a0b", "\u661f\u706b", "\u661f\u7a00", "\u8f9b\u79c0", "\u79c0\u534e", "\u963f\u65ed", 
"\u5f90\u5218\u5e08", "\u77e2\u90e8", "\u516b\u6728", "\u5c71\u4e0a", "\u963f\u9633", "\u989c\u7b11", "\u5eb7\u660e", "\u6cf0\u4e45", "\u5b89\u6b66", "\u77e2\u7530\u5e78\u559c", "\u77e2\u7530\u8f9b\u559c", "\u4e49\u575a", "\u83ba\u513f", "\u76c8\u4e30", "\u5b9c\u5e74", "\u94f6\u674f", "\u9038\u8f69", "\u6a2a\u5c71", "\u6c38\u8d35", "\u6c38\u4e1a", "\u5609\u4e45", "\u5409\u5ddd", "\u4e49\u9ad8", "\u7528\u9ad8", "\u9633\u592a", "\u5143\u84c9", "\u73a5\u8f89", "\u6bd3\u534e", "\u6709\u9999", "\u5e78\u4e5f", "\u7531\u771f", "\u7ed3\u83dc", "\u97f5\u5b81", "\u767e\u5408", "\u767e\u5408\u534e", "\u5c24\u82cf\u6ce2\u592b", "\u88d5\u5b50", "\u60a0\u7b56", "\u60a0\u4e5f", "\u4e8e\u5ae3", "\u67da\u5b50", "\u8001\u90d1", "\u6b63\u8302", "\u5fd7\u6210", "\u82b7\u5de7", "\u77e5\u6613", "\u652f\u652f", "\u5468\u826f", "\u73e0\u51fd", "\u795d\u660e", "\u795d\u6d9b"],
54
+ "symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u026f", "\u0279", "\u0259", "\u0265", "\u207c", "\u02b0", "`", "\u2192", "\u2193", "\u2191", " "]
55
+ }
VITS-MODELS/mel_processing.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.utils.data
3
+ from librosa.filters import mel as librosa_mel_fn
4
+
5
+ MAX_WAV_VALUE = 32768.0
6
+
7
+
8
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress magnitudes: log(max(x, clip_val) * C).

    C: compression factor; clip_val: floor that prevents log(0).
    """
    clamped = x.clamp(min=clip_val)
    return torch.log(clamped * C)
15
+
16
+
17
def dynamic_range_decompression_torch(x, C=1):
    """Invert dynamic_range_compression_torch: exp(x) / C.

    C: the compression factor used when compressing.
    """
    return x.exp() / C
24
+
25
+
26
def spectral_normalize_torch(magnitudes):
    """Apply log dynamic-range compression to a magnitude spectrogram."""
    return dynamic_range_compression_torch(magnitudes)
29
+
30
+
31
def spectral_de_normalize_torch(magnitudes):
    """Invert spectral_normalize_torch (exponential decompression)."""
    return dynamic_range_decompression_torch(magnitudes)
34
+
35
+
36
# Module-level caches keyed by (parameter, dtype, device) strings so mel
# filter banks and Hann windows are constructed only once per configuration.
mel_basis = {}
hann_window = {}
38
+
39
+
40
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    """Compute a linear magnitude spectrogram of shape [b, n_fft//2 + 1, frames].

    y: audio batch [b, t], expected in [-1, 1] (out-of-range values only
    trigger a warning print).  Hann windows are cached per
    (win_size, dtype, device) in the module-level ``hann_window`` dict.
    """
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    # Pad manually (center=False below) so framing alignment matches the
    # hop-based convention used elsewhere in this codebase.
    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
    y = y.squeeze(1)

    # FIX: return_complex=False is deprecated and removed in newer PyTorch;
    # return_complex=True + view_as_real produces the identical real/imag
    # layout the original code relied on.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size,
                      window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False,
                      onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)

    # Magnitude with a small epsilon for numerical stability.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec
60
+
61
+
62
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    """Project a linear spectrogram onto ``num_mels`` mel bands, log-compressed.

    The mel filter bank is cached per (fmax, dtype, device) in the
    module-level ``mel_basis`` dict.
    """
    global mel_basis
    dtype_device = str(spec.dtype) + '_' + str(spec.device)
    fmax_dtype_device = str(fmax) + '_' + dtype_device
    if fmax_dtype_device not in mel_basis:
        # FIX: pass keyword arguments — librosa >= 0.10 made mel()'s
        # parameters keyword-only, so the old positional call raises.
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels,
                             fmin=fmin, fmax=fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)
    return spec
72
+
73
+
74
def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-mel spectrogram [b, num_mels, frames] directly from audio.

    y: audio batch [b, t], expected in [-1, 1].  Mel filter banks and Hann
    windows are cached in the module-level dicts per configuration.
    """
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    fmax_dtype_device = str(fmax) + '_' + dtype_device
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if fmax_dtype_device not in mel_basis:
        # FIX: keyword arguments for librosa >= 0.10, where mel()'s
        # parameters became keyword-only.
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels,
                             fmin=fmin, fmax=fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
    y = y.squeeze(1)

    # BUG FIX: torch.stft requires an explicit return_complex since
    # PyTorch 1.8; the original call omitted it and raised at runtime.
    # return_complex=True + view_as_real matches the legacy real/imag layout.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size,
                      window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False,
                      onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)

    return spec
VITS-MODELS/models.py ADDED
@@ -0,0 +1,533 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ import commons
7
+ import modules
8
+ import attentions
9
+ import monotonic_align
10
+
11
+ from torch.nn import Conv1d, ConvTranspose1d, Conv2d
12
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13
+ from commons import init_weights, get_padding
14
+
15
+
16
class StochasticDurationPredictor(nn.Module):
    """Stochastic (flow-based) duration predictor from VITS.

    With ``reverse=False`` it returns the per-sample negative log-likelihood
    of the observed durations ``w`` under a conditional normalizing flow;
    with ``reverse=True`` it samples log-durations from the flow.
    """

    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
        super().__init__()
        filter_channels = in_channels # it needs to be removed from future version.
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        # Main flow over the 2-channel (log-duration, auxiliary) variable.
        self.log_flow = modules.Log()
        self.flows = nn.ModuleList()
        self.flows.append(modules.ElementwiseAffine(2))
        for i in range(n_flows):
            self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
            self.flows.append(modules.Flip())

        # Posterior flow, used at training time only (variational dequantization
        # of the integer-valued durations).
        self.post_pre = nn.Conv1d(1, filter_channels, 1)
        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        self.post_flows = nn.ModuleList()
        self.post_flows.append(modules.ElementwiseAffine(2))
        for i in range(4):
            self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
            self.post_flows.append(modules.Flip())

        # Text-conditioning stack shared by both directions.
        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)

    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
        """Args:
            x: text encoding [b, in_channels, t].
            x_mask: binary mask [b, 1, t].
            w: durations [b, 1, t]; required when ``reverse`` is False.
            g: optional global conditioning (e.g. speaker embedding) [b, gin, 1].
            noise_scale: temperature for sampling in reverse mode.

        Returns:
            NLL per batch element [b] (forward), or log-durations [b, 1, t]
            (reverse).
        """
        x = torch.detach(x)  # duration loss must not backprop into the text encoder
        x = self.pre(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self.convs(x, x_mask)
        x = self.proj(x) * x_mask

        if not reverse:
            flows = self.flows
            assert w is not None

            # Variational dequantization: sample (u, z1) from the posterior
            # flow and subtract u so the integer durations become continuous.
            logdet_tot_q = 0
            h_w = self.post_pre(w)
            h_w = self.post_convs(h_w, x_mask)
            h_w = self.post_proj(h_w) * x_mask
            e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
            z_q = e_q
            for flow in self.post_flows:
                z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
                logdet_tot_q += logdet_q
            z_u, z1 = torch.split(z_q, [1, 1], 1)
            u = torch.sigmoid(z_u) * x_mask
            z0 = (w - u) * x_mask
            logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
            logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q

            # Push (z0, z1) through the main flow, accumulating log-determinants.
            logdet_tot = 0
            z0, logdet = self.log_flow(z0, x_mask)
            logdet_tot += logdet
            z = torch.cat([z0, z1], 1)
            for flow in flows:
                z, logdet = flow(z, x_mask, g=x, reverse=reverse)
                logdet_tot = logdet_tot + logdet
            # Negative log-likelihood under a standard normal base distribution.
            nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot
            return nll + logq # [b]
        else:
            flows = list(reversed(self.flows))
            flows = flows[:-2] + [flows[-1]] # remove a useless vflow
            z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
            for flow in flows:
                z = flow(z, x_mask, g=x, reverse=reverse)
            z0, z1 = torch.split(z, [1, 1], 1)
            # Only the first channel carries the sampled log-duration.
            logw = z0
            return logw
95
+
96
+
97
class DurationPredictor(nn.Module):
    """Deterministic duration predictor: two masked Conv1d + LayerNorm +
    dropout blocks followed by a 1x1 projection to one log-duration channel."""

    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2)
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2)
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.proj = nn.Conv1d(filter_channels, 1, 1)

        # Optional global conditioning (e.g. speaker embedding).
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

    def forward(self, x, x_mask, g=None):
        """x: [b, in_channels, t]; x_mask: [b, 1, t]; g: optional [b, gin, 1].

        Returns masked log-durations [b, 1, t]."""
        x = torch.detach(x)  # duration loss must not backprop into the encoder
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self.conv_1(x * x_mask)
        x = torch.relu(x)
        x = self.norm_1(x)
        x = self.drop(x)
        x = self.conv_2(x * x_mask)
        x = torch.relu(x)
        x = self.norm_2(x)
        x = self.drop(x)
        x = self.proj(x * x_mask)
        return x * x_mask
132
+
133
+
134
class TextEncoder(nn.Module):
    """Transformer text (prior) encoder.

    Embeds token ids, runs a relative-attention Transformer encoder, and
    projects to the prior distribution parameters (mean ``m`` and
    log-std ``logs``) used by the flow.
    """

    def __init__(self,
        n_vocab,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout):
        super().__init__()
        self.n_vocab = n_vocab
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout

        self.emb = nn.Embedding(n_vocab, hidden_channels)
        # Scaled init matches the sqrt(d) scaling applied in forward().
        nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)

        self.encoder = attentions.Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout)
        self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths):
        """x: token ids [b, t]; x_lengths: valid lengths [b].

        Returns (hidden states [b, h, t], prior mean, prior log-std, mask)."""
        x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
        x = torch.transpose(x, 1, -1) # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)

        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        # First half of the channels is the mean, second half the log-std.
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return x, m, logs, x_mask
176
+
177
+
178
class ResidualCouplingBlock(nn.Module):
    """Normalizing flow: a stack of residual coupling layers interleaved with
    channel flips, mapping between the posterior latent and the prior space."""

    def __init__(self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for i in range(n_flows):
            # mean_only=True: volume-preserving coupling (log-det is zero).
            self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the flow (forward) or its inverse (reverse) to x [b, c, t].

        Log-determinants are discarded here; they are zero for the
        mean-only coupling layers used."""
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            # Inverse: apply the flows in reverse order, each inverted.
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x
209
+
210
+
211
class PosteriorEncoder(nn.Module):
    """Posterior encoder q(z|y): WaveNet-style stack over the linear
    spectrogram producing a diagonal Gaussian, from which z is sampled with
    the reparameterization trick."""

    def __init__(self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """x: spectrogram [b, in_channels, t]; x_lengths: [b]; g: optional
        global conditioning [b, gin, 1].

        Returns (sampled z, mean m, log-std logs, mask)."""
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        # Reparameterized sample: z = m + eps * sigma, masked to valid frames.
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask
241
+
242
+
243
class Generator(torch.nn.Module):
    """HiFi-GAN style decoder: upsamples the latent with transposed
    convolutions and fuses multi-receptive-field residual blocks, producing a
    waveform in [-1, 1] via tanh."""

    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        # resblock is '1' or '2', selecting the ResBlock variant.
        resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2

        # Each upsampling stage halves the channel count.
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(weight_norm(
                ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
                                k, u, padding=(k-u)//2)))

        # num_kernels parallel resblocks per upsampling stage.
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel//(2**(i+1))
            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """x: latent [b, initial_channel, t]; g: optional conditioning
        [b, gin, 1]. Returns waveform [b, 1, t * prod(upsample_rates)]."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            # Average the outputs of this stage's parallel resblocks.
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i*self.num_kernels+j](x)
                else:
                    xs += self.resblocks[i*self.num_kernels+j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        # Strip weight_norm wrappers for faster inference after training.
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
296
+
297
+
298
class DiscriminatorP(torch.nn.Module):
    """Period discriminator (HiFi-GAN MPD member).

    Folds the 1D waveform into a 2D map [t/period, period] and applies a
    stack of 2D convolutions, so it sees periodic structure at ``period``.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
        ])
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """x: waveform [b, 1, t].

        Returns (flattened logits, list of intermediate feature maps used by
        the feature-matching loss)."""
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0: # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
333
+
334
+
335
class DiscriminatorS(torch.nn.Module):
    """Scale discriminator: strided/grouped 1D convolutions over the raw
    waveform (HiFi-GAN MSD member)."""

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 16, 15, 1, padding=7)),
            norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
            norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
            norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """x: waveform [b, 1, t].

        Returns (flattened logits, list of intermediate feature maps)."""
        fmap = []

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
361
+
362
+
363
class MultiPeriodDiscriminator(torch.nn.Module):
    """Ensemble of one scale discriminator and five period discriminators
    (periods 2, 3, 5, 7, 11), as used by HiFi-GAN/VITS."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11]

        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        """y: real waveform [b, 1, t]; y_hat: generated waveform [b, 1, t].

        Returns:
            (real logits, generated logits, real feature maps, generated
            feature maps), each a list with one entry per sub-discriminator.
        """
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        # Plain iteration: the previous enumerate() index was never used.
        for d in self.discriminators:
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
386
+
387
+
388
+
389
class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training

    Full VITS model: text (prior) encoder, posterior encoder, normalizing
    flow, HiFi-GAN decoder and a stochastic or deterministic duration
    predictor, with optional multi-speaker conditioning.
    """

    def __init__(self,
        n_vocab,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        n_speakers=0,
        gin_channels=0,
        use_sdp=True,
        **kwargs):

        super().__init__()
        self.n_vocab = n_vocab
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels

        self.use_sdp = use_sdp

        self.enc_p = TextEncoder(n_vocab,
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout)
        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)

        if use_sdp:
            self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
        else:
            self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)

        # Fix: this previously read `n_speakers > 1`, while forward()/infer()
        # gate the speaker embedding on `self.n_speakers > 0`; a model with
        # exactly one registered speaker crashed on the missing `emb_g`.
        # Creating the embedding whenever n_speakers > 0 makes the two
        # conditions consistent without changing the multi-speaker case.
        if n_speakers > 0:
            self.emb_g = nn.Embedding(n_speakers, gin_channels)

    def forward(self, x, x_lengths, y, y_lengths, sid=None):
        """Training forward pass.

        Args:
            x, x_lengths: token ids [b, t_text] and their lengths.
            y, y_lengths: linear spectrograms [b, spec_channels, t_spec] and
                their lengths.
            sid: speaker ids [b] when n_speakers > 0.

        Returns the decoded audio slice, duration loss term, alignment,
        slice indices, the two masks, and the distribution tuple used by the
        KL loss.
        """
        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
        if self.n_speakers > 0:
            g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
        else:
            g = None

        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)

        with torch.no_grad():
            # negative cross-entropy between the flowed posterior and each
            # prior frame, used to search the monotonic alignment.
            s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t]
            neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True) # [b, 1, t_s]
            neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
            neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
            neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s]
            neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4

            attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
            attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()

        # Per-token durations implied by the alignment.
        w = attn.sum(2)
        if self.use_sdp:
            l_length = self.dp(x, x_mask, w, g=g)
            l_length = l_length / torch.sum(x_mask)
        else:
            logw_ = torch.log(w + 1e-6) * x_mask
            logw = self.dp(x, x_mask, g=g)
            l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging

        # expand prior statistics from text frames to spectrogram frames
        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)

        # Decode a random slice only, to bound the vocoder's memory use.
        z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
        o = self.dec(z_slice, g=g)
        return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
        """Inference: synthesize audio from token ids.

        noise_scale / noise_scale_w control sampling temperature of the prior
        and the duration predictor; length_scale stretches durations;
        max_len truncates the decoded latent (frames, not samples).
        """
        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
        if self.n_speakers > 0:
            g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
        else:
            g = None

        if self.use_sdp:
            logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
        else:
            logw = self.dp(x, x_mask, g=g)
        w = torch.exp(logw) * x_mask * length_scale
        w_ceil = torch.ceil(w)
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
        attn = commons.generate_path(w_ceil, attn_mask)

        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']

        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        z = self.flow(z_p, y_mask, g=g, reverse=True)
        o = self.dec((z * y_mask)[:,:,:max_len], g=g)
        return o, attn, y_mask, (z, z_p, m_p, logs_p)

    def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
        """Convert ``y`` from speaker sid_src to sid_tgt by encoding with the
        source embedding and decoding with the target embedding."""
        assert self.n_speakers > 0, "n_speakers have to be larger than 0."
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
        z_p = self.flow(z, y_mask, g=g_src)
        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
        o_hat = self.dec(z_hat * y_mask, g=g_tgt)
        return o_hat, y_mask, (z, z_p, z_hat)
533
+
VITS-MODELS/modules.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
8
+ from torch.nn.utils import weight_norm, remove_weight_norm
9
+
10
+ import commons
11
+ from commons import init_weights, get_padding
12
+ from transforms import piecewise_rational_quadratic_transform
13
+
14
+
15
+ LRELU_SLOPE = 0.1
16
+
17
+
18
class LayerNorm(nn.Module):
    """LayerNorm over the channel axis of a [b, channels, t] tensor.

    Equivalent to ``F.layer_norm`` applied after moving channels last, with
    learnable per-channel scale (``gamma``) and shift (``beta``).
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        # Move channels to the last axis, normalise there, then move back.
        transposed = x.transpose(1, -1)
        normed = F.layer_norm(transposed, (self.channels,), self.gamma, self.beta, self.eps)
        return normed.transpose(1, -1)
31
+
32
+
33
class ConvReluNorm(nn.Module):
    """Stack of Conv1d -> LayerNorm -> ReLU -> Dropout layers with a final
    1x1 projection and a residual connection from the input."""

    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        # Fixed message: the check requires n_layers > 1, but the old message
        # claimed "larger than 0".
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(p_dropout))
        for _ in range(n_layers-1):
            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        # Zero-init so the block starts as the identity (residual only).
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        """x: [b, in_channels, t]; x_mask: [b, 1, t]. Returns masked
        [b, out_channels, t] (in_channels must equal out_channels for the
        residual sum to be valid)."""
        x_org = x
        for i in range(self.n_layers):
            x = self.conv_layers[i](x * x_mask)
            x = self.norm_layers[i](x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask
66
+
67
+
68
class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution

    Each layer is a depthwise dilated conv followed by a 1x1 pointwise conv,
    with LayerNorm + GELU after each and a residual connection per layer.
    Dilation grows as kernel_size**i, expanding the receptive field.
    """
    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for i in range(n_layers):
            dilation = kernel_size ** i
            # 'same' padding for the dilated depthwise conv.
            padding = (kernel_size * dilation - dilation) // 2
            self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
                groups=channels, dilation=dilation, padding=padding
            ))
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        """x: [b, channels, t]; x_mask: [b, 1, t]; g: optional additive
        conditioning with the same shape as x."""
        if g is not None:
            x = x + g
        for i in range(self.n_layers):
            y = self.convs_sep[i](x * x_mask)
            y = self.norms_1[i](y)
            y = F.gelu(y)
            y = self.convs_1x1[i](y)
            y = self.norms_2[i](y)
            y = F.gelu(y)
            y = self.drop(y)
            x = x + y  # residual per layer
        return x * x_mask
107
+
108
+
109
class WN(torch.nn.Module):
    """Non-causal WaveNet-style residual stack with gated tanh/sigmoid
    activations and optional global conditioning, as used by the posterior
    encoder and the coupling layers."""

    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super(WN, self).__init__()
        assert(kernel_size % 2 == 1)
        self.hidden_channels = hidden_channels
        # Fixed: a stray trailing comma previously stored a 1-tuple here
        # (`self.kernel_size = kernel_size,`).
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # One shared conditioning projection supplying 2*h channels per layer.
            cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')

        for i in range(n_layers):
            dilation = dilation_rate ** i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            # last one is not necessary
            if i < n_layers - 1:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        """x: [b, h, t]; x_mask: [b, 1, t]; g: optional conditioning
        [b, gin_channels, t or 1]. Returns the accumulated skip output."""
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                # Slice out this layer's 2*h conditioning channels.
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
            else:
                g_l = torch.zeros_like(x_in)

            acts = commons.fused_add_tanh_sigmoid_multiply(
                x_in,
                g_l,
                n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                # First half is the residual path, second half the skip path.
                res_acts = res_skip_acts[:,:self.hidden_channels,:]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:,self.hidden_channels:,:]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        # Strip weight_norm wrappers for inference.
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for l in self.in_layers:
            torch.nn.utils.remove_weight_norm(l)
        for l in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(l)
183
+
184
+
185
class ResBlock1(torch.nn.Module):
    """HiFi-GAN residual block type 1: three pairs of (dilated, undilated)
    weight-normalized convolutions with leaky-ReLU and residual sums."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        # Dilated convolutions (one per dilation value).
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                padding=get_padding(kernel_size, dilation[1]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                padding=get_padding(kernel_size, dilation[2])))
        ])
        self.convs1.apply(init_weights)

        # Matching undilated convolutions.
        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                padding=get_padding(kernel_size, 1)))
        ])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """x: [b, channels, t]; optional x_mask [b, 1, t] re-applied around
        every convolution to keep padded frames at zero."""
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c2(xt)
            x = xt + x
        if x_mask is not None:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        # Strip weight_norm wrappers for inference.
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)
228
+
229
+
230
class ResBlock2(torch.nn.Module):
    """HiFi-GAN residual block type 2: a lighter variant with two dilated
    weight-normalized convolutions and residual sums."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                padding=get_padding(kernel_size, dilation[1])))
        ])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """x: [b, channels, t]; optional x_mask [b, 1, t] keeps padded
        frames at zero."""
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c(xt)
            x = xt + x
        if x_mask is not None:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        # Strip weight_norm wrappers for inference.
        for l in self.convs:
            remove_weight_norm(l)
255
+
256
+
257
class Log(nn.Module):
    """Invertible elementwise log flow.

    Forward maps x -> log(x) (clamped below at 1e-5 for stability) and
    returns the log-determinant of the Jacobian; reverse maps x -> exp(x).
    Both directions are masked by ``x_mask``.
    """

    def forward(self, x, x_mask, reverse=False, **kwargs):
        if reverse:
            return torch.exp(x) * x_mask
        y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
        # d/dx log(x) = 1/x, so logdet = sum(-log x) = sum(-y) over masked entries.
        logdet = torch.sum(-y, [1, 2])
        return y, logdet
266
+
267
+
268
class Flip(nn.Module):
    """Invertible channel-reversal flow.

    Reverses the channel axis in both directions (the operation is its own
    inverse); the log-determinant is zero since no values are scaled.
    """

    def forward(self, x, *args, reverse=False, **kwargs):
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped
        # A permutation has unit |det|, hence zero log-determinant.
        logdet = torch.zeros(flipped.size(0), dtype=flipped.dtype, device=flipped.device)
        return flipped, logdet
276
+
277
+
278
class ElementwiseAffine(nn.Module):
    """Invertible per-channel affine flow: y = m + exp(logs) * x.

    ``m`` and ``logs`` are learnable [channels, 1] parameters broadcast over
    time; both directions are masked by ``x_mask``.
    """

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        if reverse:
            # Inverse transform: x = (y - m) * exp(-logs).
            return (x - self.m) * torch.exp(-self.logs) * x_mask
        y = (self.m + torch.exp(self.logs) * x) * x_mask
        # Jacobian is diagonal with entries exp(logs), summed over valid positions.
        logdet = torch.sum(self.logs * x_mask, [1, 2])
        return y, logdet
294
+
295
+
296
class ResidualCouplingLayer(nn.Module):
    """Affine coupling flow layer: the first half of the channels is left
    unchanged and parameterizes (via a WN stack) the affine transform applied
    to the second half. With ``mean_only`` the scale is fixed to 1, making
    the layer volume-preserving."""

    def __init__(self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        p_dropout=0,
        gin_channels=0,
        mean_only=False):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
        # Outputs mean only (mean_only) or mean + log-scale.
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        # Zero-init so the layer starts as the identity flow.
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """x: [b, channels, t]; x_mask: [b, 1, t]; g: optional conditioning.

        Returns (y, logdet) in the forward direction, or y alone in reverse."""
        x0, x1 = torch.split(x, [self.half_channels]*2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels]*2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1,2])
            return x, logdet
        else:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            x = torch.cat([x0, x1], 1)
            return x
342
+
343
+
344
class ConvFlow(nn.Module):
    """Coupling flow whose transform of the second channel half is a
    piecewise rational-quadratic spline (neural spline flow), parameterized
    by a DDSConv stack conditioned on the first half."""

    def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
        super().__init__()
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.num_bins = num_bins
        self.tail_bound = tail_bound
        self.half_channels = in_channels // 2

        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
        # num_bins widths + num_bins heights + (num_bins - 1) derivatives per channel.
        self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
        # Zero-init so the flow starts close to the identity.
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """x: [b, in_channels, t]; x_mask: [b, 1, t]; g: optional conditioning.

        Returns (y, logdet) forward, or y alone in reverse."""
        x0, x1 = torch.split(x, [self.half_channels]*2, 1)
        h = self.pre(x0)
        h = self.convs(h, x_mask, g=g)
        h = self.proj(h) * x_mask

        b, c, t = x0.shape
        h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]

        # Scaling by 1/sqrt(filter_channels) stabilizes the spline parameters.
        unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
        unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
        unnormalized_derivatives = h[..., 2 * self.num_bins:]

        x1, logabsdet = piecewise_rational_quadratic_transform(x1,
            unnormalized_widths,
            unnormalized_heights,
            unnormalized_derivatives,
            inverse=reverse,
            tails='linear',
            tail_bound=self.tail_bound
        )

        x = torch.cat([x0, x1], 1) * x_mask
        logdet = torch.sum(logabsdet * x_mask, [1,2])
        if not reverse:
            return x, logdet
        else:
            return x
VITS-MODELS/monotonic_align/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from numpy import zeros, int32, float32
2
+ from torch import from_numpy
3
+
4
+ from .core import maximum_path_jit
5
+
6
+
7
+ def maximum_path(neg_cent, mask):
8
+ """ numba optimized version.
9
+ neg_cent: [b, t_t, t_s]
10
+ mask: [b, t_t, t_s]
11
+ """
12
+ device = neg_cent.device
13
+ dtype = neg_cent.dtype
14
+ neg_cent = neg_cent.data.cpu().numpy().astype(float32)
15
+ path = zeros(neg_cent.shape, dtype=int32)
16
+
17
+ t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32)
18
+ t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32)
19
+ maximum_path_jit(path, neg_cent, t_t_max, t_s_max)
20
+ return from_numpy(path).to(device=device, dtype=dtype)
VITS-MODELS/monotonic_align/core.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numba
2
+
3
+
4
+ @numba.jit(numba.void(numba.int32[:, :, ::1], numba.float32[:, :, ::1], numba.int32[::1], numba.int32[::1]),
5
+ nopython=True, nogil=True)
6
+ def maximum_path_jit(paths, values, t_ys, t_xs):
7
+ b = paths.shape[0]
8
+ max_neg_val = -1e9
9
+ for i in range(int(b)):
10
+ path = paths[i]
11
+ value = values[i]
12
+ t_y = t_ys[i]
13
+ t_x = t_xs[i]
14
+
15
+ v_prev = v_cur = 0.0
16
+ index = t_x - 1
17
+
18
+ for y in range(t_y):
19
+ for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
20
+ if x == y:
21
+ v_cur = max_neg_val
22
+ else:
23
+ v_cur = value[y - 1, x]
24
+ if x == 0:
25
+ if y == 0:
26
+ v_prev = 0.
27
+ else:
28
+ v_prev = max_neg_val
29
+ else:
30
+ v_prev = value[y - 1, x - 1]
31
+ value[y, x] += max(v_prev, v_cur)
32
+
33
+ for y in range(t_y - 1, -1, -1):
34
+ path[y, index] = 1
35
+ if index != 0 and (index == y or value[y - 1, index] < value[y - 1, index - 1]):
36
+ index = index - 1
VITS-MODELS/pretrained_models/abyssinvoker/abyssinvoker.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7f20e5575d3f265c3b6a092751253df52726dd9b5d09252c267746c882698ea
3
+ size 159706189
VITS-MODELS/pretrained_models/abyssinvoker/cover.png ADDED

Git LFS Details

  • SHA256: d728a8defd19ccd2eb4fb5dde7ead280f7dfbe5f808257e84004588dbfae67d4
  • Pointer size: 131 Bytes
  • Size of remote file: 999 kB
VITS-MODELS/pretrained_models/alice/alice.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38ec9a70b25319dd05884242cf84c08ac337e27534f56fc90c5f5460251fb561
3
+ size 159706189
VITS-MODELS/pretrained_models/alice/cover.png ADDED

Git LFS Details

  • SHA256: 7f25930c9fe0e874ae34e8a21b12215dced4c05a6c21e93dd76a2dae8cc4b575
  • Pointer size: 131 Bytes
  • Size of remote file: 743 kB
VITS-MODELS/pretrained_models/ameth/ameth.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:293a2d89ad400edfc9279bcf1cb3b32cfda484babdcafa71cabcfa1d1cad8495
3
+ size 145471413
VITS-MODELS/pretrained_models/ameth/cover.png ADDED

Git LFS Details

  • SHA256: 2d6654056fcf4e04dce92cb50e15fcdd88b56e5ec08b701959f162dd01bd2eca
  • Pointer size: 131 Bytes
  • Size of remote file: 340 kB
VITS-MODELS/pretrained_models/asuna/asuna.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cdb530c944533c68a6169edb3acdedb1f3bd17e6e4a3dc71285c870e575a36a
3
+ size 159706189
VITS-MODELS/pretrained_models/asuna/cover.png ADDED

Git LFS Details

  • SHA256: 829399678e075e881878af8866569b9291a30c0fc5fcf3d65716d5691ca80bc4
  • Pointer size: 131 Bytes
  • Size of remote file: 133 kB
VITS-MODELS/pretrained_models/ayaka-jp/ayaka-jp.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa4dcf1ba1d782f85dcaf832ef4903872e801dee501f20cf7570841f6e4a0180
3
+ size 159706189
VITS-MODELS/pretrained_models/ayaka-jp/cover.png ADDED

Git LFS Details

  • SHA256: 1fe85d2c9895d4d0010660eb42ffa154edb7e0decc97f4444ba6009c69d029c0
  • Pointer size: 131 Bytes
  • Size of remote file: 838 kB
VITS-MODELS/pretrained_models/azusa/azusa.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b29524ef3d13c8dfec9f1755fc2fb753ee40c2e313c1b7dccb2f805d57264911
3
+ size 159706189
VITS-MODELS/pretrained_models/azusa/cover.png ADDED

Git LFS Details

  • SHA256: 412997d10077e7d3b85e274f63c15396c1eaa7c5033d131de0037b796c98c9c4
  • Pointer size: 131 Bytes
  • Size of remote file: 780 kB
VITS-MODELS/pretrained_models/bronya/bronya.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03b67727a8cf31767d2ee266e86f03e6ec61aebe23a16663cf712ba22f0eaab5
3
+ size 159706189
VITS-MODELS/pretrained_models/bronya/cover.png ADDED
VITS-MODELS/pretrained_models/chisato/chisato.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bfe2195080dd81503946f5d24f20ac0e337263a736c37dca22b54240bc310de
3
+ size 145471413
VITS-MODELS/pretrained_models/chisato/cover.png ADDED

Git LFS Details

  • SHA256: d4e261e412a09e4c50281dd8430c8bef88c07cfbd9bce6803ae2f040bdc30edc
  • Pointer size: 131 Bytes
  • Size of remote file: 132 kB
VITS-MODELS/pretrained_models/doom/cover.png ADDED

Git LFS Details

  • SHA256: 5764d61453ed097338103b53fb7c5b0ced488db3a7806b35d913652cc262b5aa
  • Pointer size: 132 Bytes
  • Size of remote file: 1.03 MB
VITS-MODELS/pretrained_models/doom/doom.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee6f598ec8333f1dfd1e890a67e85ed3b2eaaf35f99f8eca02f4db8c70ab8d2d
3
+ size 159706189
VITS-MODELS/pretrained_models/echo/cover.png ADDED

Git LFS Details

  • SHA256: 8d818c853789241fd98e60fbabcc27b960bfff2e42fa28a4cafda10c7c90a76d
  • Pointer size: 131 Bytes
  • Size of remote file: 625 kB
VITS-MODELS/pretrained_models/echo/echo.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7050e1f5d8ed5dce02379aa77040cf4b3435e0c294b9260dd418c3072bd4ac70
3
+ size 159706189
VITS-MODELS/pretrained_models/eriko/cover.png ADDED

Git LFS Details

  • SHA256: 6f9d64c8882913c527d02db31fdafc56ff307f492a55ee20231b6378045497ef
  • Pointer size: 131 Bytes
  • Size of remote file: 136 kB
VITS-MODELS/pretrained_models/eriko/eriko.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f211bfcad2d0512a8756002b14c4e5d2ed72ff7e0d0ac4fd038182fda696f81
3
+ size 145471477
VITS-MODELS/pretrained_models/eula/cover.png ADDED

Git LFS Details

  • SHA256: caa943e77bdc7746ea467271dfeced7eb7dd65ff5c7c51b37b3ab083a6b0149d
  • Pointer size: 131 Bytes
  • Size of remote file: 118 kB
VITS-MODELS/pretrained_models/eula/eula.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbfaf3e5dbdca404efaf761cddd60630112bea395791accc96bff4b03039bac3
3
+ size 159706189
VITS-MODELS/pretrained_models/hatsune/cover.png ADDED

Git LFS Details

  • SHA256: 1dc5d6e3cbc07df258eef6dd2c62ca31a5023cc802a4be57c5aaf8f99d72a63c
  • Pointer size: 131 Bytes
  • Size of remote file: 292 kB
VITS-MODELS/pretrained_models/hatsune/hatsune.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93ff23ba52e7c6675d3a63baeb9531f5706a85666739c3378d4d0f1dc7b46df3
3
+ size 159706189
VITS-MODELS/pretrained_models/herta/cover.png ADDED

Git LFS Details

  • SHA256: efa7fc0645d94a5ed736e8735096fcc86730a731b396f6380dca9638abd14db8
  • Pointer size: 131 Bytes
  • Size of remote file: 803 kB
VITS-MODELS/pretrained_models/herta/herta.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed430483007dc7f1d5eb93c26200c20655683672ff2f2eeadaeef75edbb8bf56
3
+ size 159650901
VITS-MODELS/pretrained_models/hina/cover.png ADDED

Git LFS Details

  • SHA256: bd750f9f0cd55ef0bdaade7c3e50642d755c7aae9efd59cae40f190da6738f5f
  • Pointer size: 131 Bytes
  • Size of remote file: 171 kB
VITS-MODELS/pretrained_models/hina/hina.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f1aafbf4ab512ffd076434f27ab8375709dd22eaf9e7ca5d2d0c96f8ccbde95
3
+ size 159706189
VITS-MODELS/pretrained_models/hiyori/cover.png ADDED

Git LFS Details

  • SHA256: 353df61b19740227b4d3560b89e0b7953ac881d63c92b29bbcb8cabbe06b42a7
  • Pointer size: 131 Bytes
  • Size of remote file: 207 kB
VITS-MODELS/pretrained_models/hiyori/hiyori.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52168d7d2d015236d1549bffc808b4084f7c3ee722893aec0ec0e4f560e421b9
3
+ size 145471413
VITS-MODELS/pretrained_models/hoshino/cover.png ADDED

Git LFS Details

  • SHA256: 2844b2d7d6572290d0a50664ce7c3bc690eecf33471f4e06160b176bda940fc4
  • Pointer size: 131 Bytes
  • Size of remote file: 852 kB
VITS-MODELS/pretrained_models/hoshino/hoshino.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3510c37149fd6ecf791b0c056c6cb87ce3c5a53e06a240ca8aa48ddfe96b57b
3
+ size 159706189
VITS-MODELS/pretrained_models/info.json ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "kafka": {
3
+ "enable": true,
4
+ "name_en": "kafka",
5
+ "name_zh": "卡芙卡",
6
+ "title": "Honkai: Star Rail-カフカ",
7
+ "cover": "cover.png",
8
+ "sid": 10,
9
+ "example": "すずめの戸締り、みんなは何回みた〜?",
10
+ "language": "Japanese",
11
+ "type": "multi"
12
+ },
13
+ "herta": {
14
+ "enable": true,
15
+ "name_en": "herta",
16
+ "name_zh": "黑塔",
17
+ "title": "Honkai: Star Rail-ヘルタ",
18
+ "cover": "cover.png",
19
+ "sid": 10,
20
+ "example": "すずめの戸締り、みんなは何回みた〜?",
21
+ "language": "Japanese",
22
+ "type": "multi"
23
+ },
24
+ "momoi": {
25
+ "enable": true,
26
+ "name_en": "Saiba Momoi",
27
+ "name_zh": "才羽桃井",
28
+ "title": "Blue Archive-才羽モモイ",
29
+ "cover": "cover.png",
30
+ "sid": 10,
31
+ "example": "すずめの戸締り、みんなは何回みた〜?",
32
+ "language": "Japanese",
33
+ "type": "multi"
34
+ },
35
+ "yuni": {
36
+ "enable": true,
37
+ "name_en": "Yuni",
38
+ "name_zh": "优妮",
39
+ "title": "Princess Connect! Re:Dive-ユニ",
40
+ "cover": "cover.png",
41
+ "sid": 0,
42
+ "example": "それに新しいお菓子屋さんも出来てみんな買いものを楽しんでいます!",
43
+ "language": "Japanese",
44
+ "type": "single"
45
+ },
46
+ "misora": {
47
+ "enable": true,
48
+ "name_en": "Misora",
49
+ "name_zh": "美空",
50
+ "title": "Princess Connect! Re:Dive-ミソラ",
51
+ "cover": "cover.png",
52
+ "sid": 0,
53
+ "example": "ただいま。お詫びにお前の好きなケーキを買ってきてやったから、一緒に喰おうな?",
54
+ "language": "Japanese",
55
+ "type": "single"
56
+ },
57
+ "kyoka": {
58
+ "enable": true,
59
+ "name_en": "Kyoka",
60
+ "name_zh": "镜华",
61
+ "title": "Princess Connect! Re:Dive-キョウカ",
62
+ "cover": "cover.png",
63
+ "sid": 0,
64
+ "example": "まだまだ領内の発展に落ち着きは生まれそうにないわね",
65
+ "language": "Japanese",
66
+ "type": "multi"
67
+ },
68
+ "hiyori": {
69
+ "enable": true,
70
+ "name_en": "Hiyori",
71
+ "name_zh": "日和莉",
72
+ "title": "Princess Connect! Re:Dive-ヒヨリ",
73
+ "cover": "cover.png",
74
+ "sid": 0,
75
+ "example": "今日はいい天気ですね!",
76
+ "language": "Japanese",
77
+ "type": "single"
78
+ },
79
+ "ameth": {
80
+ "enable": true,
81
+ "name_en": "Ameth",
82
+ "name_zh": "爱梅斯",
83
+ "title": "Princess Connect! Re:Dive-アメス",
84
+ "cover": "cover.png",
85
+ "sid": 0,
86
+ "example": "きょうは高気圧に緩やかに覆われるでしょう。沖縄と九州から北海道にかけて晴れる所が多くなりそうです。",
87
+ "language": "Japanese",
88
+ "type": "single"
89
+ },
90
+ "hatsune": {
91
+ "enable": true,
92
+ "name_en": "Hatsune",
93
+ "name_zh": "柏崎初音",
94
+ "title": "Princess Connect! Re:Dive-柏崎初音",
95
+ "cover": "cover.png",
96
+ "sid": 10,
97
+ "example": "バトルでの役割や立ち回りをチェックしてみてくださいね!",
98
+ "language": "Japanese",
99
+ "type": "multi"
100
+ },
101
+ "eriko": {
102
+ "enable": true,
103
+ "name_en": "Eriko",
104
+ "name_zh": "惠理子",
105
+ "title": "Princess Connect! Re:Dive-倉石恵理子",
106
+ "cover": "cover.png",
107
+ "sid": 0,
108
+ "example": "プリンセスコネクト",
109
+ "language": "Japanese",
110
+ "type": "single"
111
+ },
112
+ "pecorine": {
113
+ "enable": true,
114
+ "name_en": "Pecorine",
115
+ "name_zh": "佩可莉姆",
116
+ "title": "Princess Connect! Re:Dive-ペコリーヌ",
117
+ "cover": "cover.png",
118
+ "sid": 10,
119
+ "example": "今日はいい天気ですね!",
120
+ "language": "Japanese",
121
+ "type": "multi"
122
+ },
123
+ "kokoro": {
124
+ "enable": true,
125
+ "name_en": "Kokoro",
126
+ "name_zh": "可可萝",
127
+ "title": "Princess Connect! Re:Dive-棗こころ",
128
+ "cover": "cover.png",
129
+ "sid": 0,
130
+ "example": "今日はいい天気ですね。",
131
+ "language": "Japanese",
132
+ "type": "single"
133
+ },
134
+ "kyaru": {
135
+ "enable": true,
136
+ "name_en": "Kyaru",
137
+ "name_zh": "凯露",
138
+ "title": "Princess Connect! Re:Dive-キャル",
139
+ "cover": "cover.png",
140
+ "sid": 10,
141
+ "example": "今日はいい天気ですね!!",
142
+ "language": "Japanese",
143
+ "type": "multi"
144
+ },
145
+ "chisato": {
146
+ "enable": true,
147
+ "name_en": "Nishikigi Chisato",
148
+ "name_zh": "锦木千束",
149
+ "title": "Lycoris Recoil-錦木千束",
150
+ "cover": "cover.png",
151
+ "sid": 0,
152
+ "example": "今日はいい天気ですね。",
153
+ "language": "Japanese",
154
+ "type": "single"
155
+ },
156
+ "takina": {
157
+ "enable": true,
158
+ "name_en": "Takina Inoue",
159
+ "name_zh": "井上泷奈",
160
+ "title": "Lycoris Recoil-井ノ上たきな",
161
+ "cover": "cover.png",
162
+ "sid": 0,
163
+ "example": "今日はいい天気ですね。",
164
+ "language": "Japanese",
165
+ "type": "single"
166
+ },
167
+ "ayaka-jp": {
168
+ "enable": true,
169
+ "name_en": "ayaka-jp",
170
+ "name_zh": "神里绫华-日语",
171
+ "title": "Genshin Impact-神里綾華",
172
+ "cover": "cover.png",
173
+ "sid": 303,
174
+ "example": "今日はいい天気ですね。",
175
+ "language": "Japanese",
176
+ "type": "multi"
177
+ },
178
+ "nahida-jp": {
179
+ "enable": true,
180
+ "name_en": "nahida-jp",
181
+ "name_zh": "纳西妲-日语",
182
+ "title": "Genshin Impact-ナヒーダ",
183
+ "cover": "cover.png",
184
+ "sid": 0,
185
+ "example": "今日はいい天気ですね。",
186
+ "language": "Japanese",
187
+ "type": "single"
188
+ },
189
+ "iroha": {
190
+ "enable": true,
191
+ "name_en": "Natsume Iroha",
192
+ "name_zh": "枣伊吕波",
193
+ "title": "Blue Archive-棗イロハ",
194
+ "cover": "cover.png",
195
+ "sid": 10,
196
+ "example": "今日はいい天気ですね。",
197
+ "language": "Japanese",
198
+ "type": "multi"
199
+ },
200
+ "mika": {
201
+ "enable": true,
202
+ "name_en": "Misono Mika",
203
+ "name_zh": "圣园未花",
204
+ "title": "Blue Archive-聖園ミカ",
205
+ "cover": "cover.png",
206
+ "sid": 10,
207
+ "example": "今日はいい天気ですね。",
208
+ "language": "Japanese",
209
+ "type": "multi"
210
+ },
211
+ "miyu": {
212
+ "enable": true,
213
+ "name_en": "Kasumizawa Miyu",
214
+ "name_zh": "霞泽美游",
215
+ "title": "Blue Archive-霞沢ミユ",
216
+ "cover": "cover.png",
217
+ "sid": 10,
218
+ "example": "今日はいい天気ですね。",
219
+ "language": "Japanese",
220
+ "type": "multi"
221
+ },
222
+ "karin": {
223
+ "enable": false,
224
+ "name_en": "Kakudate Karin",
225
+ "name_zh": "角楯花凛",
226
+ "title": "Blue Archive-角楯カリン",
227
+ "cover": "cover.png",
228
+ "sid": 10,
229
+ "example": "今日はいい天気ですね。",
230
+ "language": "Japanese",
231
+ "type": "multi"
232
+ },
233
+ "asuna": {
234
+ "enable": false,
235
+ "name_en": "Ichinose Asuna",
236
+ "name_zh": "一之濑明日奈",
237
+ "title": "Blue Archive-一之瀬アスナ",
238
+ "cover": "cover.png",
239
+ "sid": 10,
240
+ "example": "今日はいい天気ですね。",
241
+ "language": "Japanese",
242
+ "type": "multi"
243
+ },
244
+ "azusa": {
245
+ "enable": true,
246
+ "name_en": "Shirasu Azusa",
247
+ "name_zh": "白洲梓",
248
+ "title": "Blue Archive-白洲アズサ",
249
+ "cover": "cover.png",
250
+ "sid": 10,
251
+ "example": "今日はいい天気ですね。",
252
+ "language": "Japanese",
253
+ "type": "multi"
254
+ },
255
+ "alice": {
256
+ "enable": true,
257
+ "name_en": "Tendou Alice",
258
+ "name_zh": "天童爱丽丝",
259
+ "title": "Blue Archive-天童アリス",
260
+ "cover": "cover.png",
261
+ "sid": 10,
262
+ "example": "今日はいい天気ですね。",
263
+ "language": "Japanese",
264
+ "type": "multi"
265
+ },
266
+ "shiroko": {
267
+ "enable": true,
268
+ "name_en": "Sunaookami Shiroko",
269
+ "name_zh": "砂狼白子",
270
+ "title": "Blue Archive-砂狼シロコ",
271
+ "cover": "cover.png",
272
+ "sid": 10,
273
+ "example": "今日はいい天気ですね。",
274
+ "language": "Japanese",
275
+ "type": "multi"
276
+ },
277
+ "hoshino": {
278
+ "enable": false,
279
+ "name_en": "Takanasi Hosino",
280
+ "name_zh": "小鸟游星野",
281
+ "title": "Blue Archive-小鳥遊 ホシノ",
282
+ "cover": "cover.png",
283
+ "sid": 10,
284
+ "example": "今日はいい天気ですね。",
285
+ "language": "Japanese",
286
+ "type": "multi"
287
+ },
288
+ "hina": {
289
+ "enable": true,
290
+ "name_en": "Sorasaki Hina",
291
+ "name_zh": "空崎日奈",
292
+ "title": "Blue Archive-空崎 ヒナ",
293
+ "cover": "cover.png",
294
+ "sid": 10,
295
+ "example": "今日はいい天気ですね。",
296
+ "language": "Japanese",
297
+ "type": "multi"
298
+ },
299
+ "iori": {
300
+ "enable": true,
301
+ "name_en": "Shiromi Iori",
302
+ "name_zh": "银镜伊织",
303
+ "title": "Blue Archive-銀鏡イオリ",
304
+ "cover": "cover.png",
305
+ "sid": 10,
306
+ "example": "今日はいい天気ですね。",
307
+ "language": "Japanese",
308
+ "type": "multi"
309
+ },
310
+ "izuna": {
311
+ "enable": true,
312
+ "name_en": "Kuda Izuna",
313
+ "name_zh": "久田泉奈",
314
+ "title": "Blue Archive-久田イズナ",
315
+ "cover": "cover.png",
316
+ "sid": 10,
317
+ "example": "今日はいい天気ですね。",
318
+ "language": "Japanese",
319
+ "type": "multi"
320
+ },
321
+ "yuuka": {
322
+ "enable": true,
323
+ "name_en": "Hayase Yuuka",
324
+ "name_zh": "早濑优香",
325
+ "title": "Blue Archive-早瀬ユウカ",
326
+ "cover": "cover.png",
327
+ "sid": 40,
328
+ "example": "今日はいい天気ですね。",
329
+ "language": "Japanese",
330
+ "type": "multi"
331
+ },
332
+ "doom": {
333
+ "enable": true,
334
+ "name_en": "Doomfist",
335
+ "name_zh": "末日铁拳",
336
+ "title": "Overwatch 2-Doomfist",
337
+ "cover": "cover.png",
338
+ "sid": 93,
339
+ "example": "无需等待队列,并且没有长度限制",
340
+ "language": "Chinese",
341
+ "type": "multi"
342
+ },
343
+ "echo": {
344
+ "enable": true,
345
+ "name_en": "Echo",
346
+ "name_zh": "回声",
347
+ "title": "Overwatch 2-Echo",
348
+ "cover": "cover.png",
349
+ "sid": 93,
350
+ "example": "正在复制,派蒙",
351
+ "language": "Chinese",
352
+ "type": "multi"
353
+ },
354
+ "zenyatta": {
355
+ "enable": true,
356
+ "name_en": "Zenyatta",
357
+ "name_zh": "禅雅塔",
358
+ "title": "Overwatch 2-Zenyatta",
359
+ "cover": "cover.png",
360
+ "sid": 93,
361
+ "example": "今天晚上吃啥好呢",
362
+ "language": "Chinese",
363
+ "type": "multi"
364
+ },
365
+ "abyssinvoker": {
366
+ "enable": true,
367
+ "name_en": "Abyss Invoker",
368
+ "name_zh": "深渊使徒",
369
+ "title": "Genshin Impact-深渊使徒",
370
+ "cover": "cover.png",
371
+ "sid": 94,
372
+ "example": "今天晚上吃啥好呢",
373
+ "language": "Chinese",
374
+ "type": "multi"
375
+ },
376
+ "keqing": {
377
+ "enable": true,
378
+ "name_en": "Keqing",
379
+ "name_zh": "刻晴",
380
+ "title": "Genshin Impact-刻晴",
381
+ "cover": "cover.png",
382
+ "sid": 115,
383
+ "example": "今天晚上吃啥好呢",
384
+ "language": "Chinese",
385
+ "type": "multi"
386
+ },
387
+ "eula": {
388
+ "enable": true,
389
+ "name_en": "Eula",
390
+ "name_zh": "优菈",
391
+ "title": "Genshin Impact-优菈",
392
+ "cover": "cover.png",
393
+ "sid": 124,
394
+ "example": "今天晚上吃啥好呢",
395
+ "language": "Chinese",
396
+ "type": "multi"
397
+ },
398
+ "bronya": {
399
+ "enable": true,
400
+ "name_en": "Herrscher of Reason",
401
+ "name_zh": "理之律者",
402
+ "title": "Honkai Impact 3rd-理之律者",
403
+ "cover": "cover.png",
404
+ "sid": 193,
405
+ "example": "今天晚上吃啥好呢",
406
+ "language": "Chinese",
407
+ "type": "multi"
408
+ },
409
+ "theresa": {
410
+ "enable": true,
411
+ "name_en": "Theresa",
412
+ "name_zh": "德丽莎",
413
+ "title": "Honkai Impact 3rd-德丽莎",
414
+ "cover": "cover.png",
415
+ "sid": 193,
416
+ "example": "今天晚上吃啥好呢",
417
+ "language": "Chinese",
418
+ "type": "multi"
419
+ }
420
+ }
VITS-MODELS/pretrained_models/iori/cover.png ADDED

Git LFS Details

  • SHA256: 96219c2e57a2fa2aa468c1c45faa49a18ab5066df2338562f31057981ae14b41
  • Pointer size: 131 Bytes
  • Size of remote file: 154 kB
VITS-MODELS/pretrained_models/iori/iori.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e38336227ee7dfce8be61290461cae9396dedd45f90100a43466be9e341749c7
3
+ size 159706189
VITS-MODELS/pretrained_models/iroha/cover.png ADDED
VITS-MODELS/pretrained_models/iroha/iroha.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:045e9171a56078e5e517fc045b25651fa3ce4fa8afd4b1f98ecfc2b8e27cd4f9
3
+ size 159706189
VITS-MODELS/pretrained_models/izuna/cover.png ADDED

Git LFS Details

  • SHA256: def03ec91ea310fd268ddce0303ea36ab0792f30355f6317c3eb4b20f7eb9285
  • Pointer size: 131 Bytes
  • Size of remote file: 691 kB