Spaces:
Runtime error
Runtime error
RyaoChengfeng
committed on
Commit
·
79dd817
1
Parent(s):
b5b5597
git lfs
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +3 -1
- .gitignore +1 -1
- pretrained_models/moe-tts/0/config.json +116 -0
- pretrained_models/moe-tts/0/cover.jpg +3 -0
- pretrained_models/moe-tts/0/model.pth +3 -0
- pretrained_models/moe-tts/1/config.json +35 -0
- pretrained_models/moe-tts/1/cover.jpg +3 -0
- pretrained_models/moe-tts/1/model.pth +3 -0
- pretrained_models/moe-tts/10/config.json +52 -0
- pretrained_models/moe-tts/10/cover.jpg +3 -0
- pretrained_models/moe-tts/10/model.pth +3 -0
- pretrained_models/moe-tts/11/config.json +52 -0
- pretrained_models/moe-tts/11/cover.jpg +3 -0
- pretrained_models/moe-tts/11/model.pth +3 -0
- pretrained_models/moe-tts/12/config.json +35 -0
- pretrained_models/moe-tts/12/cover.jpg +3 -0
- pretrained_models/moe-tts/12/model.pth +3 -0
- pretrained_models/moe-tts/13/config.json +35 -0
- pretrained_models/moe-tts/13/cover.jpg +3 -0
- pretrained_models/moe-tts/13/model.pth +3 -0
- pretrained_models/moe-tts/14/config.json +35 -0
- pretrained_models/moe-tts/14/model.pth +3 -0
- pretrained_models/moe-tts/15/config.json +0 -0
- pretrained_models/moe-tts/15/model.pth +3 -0
- pretrained_models/moe-tts/16/config.json +35 -0
- pretrained_models/moe-tts/16/model.pth +3 -0
- pretrained_models/moe-tts/17/config.json +35 -0
- pretrained_models/moe-tts/17/model.pth +3 -0
- pretrained_models/moe-tts/18/config.json +142 -0
- pretrained_models/moe-tts/18/cover.jpg +3 -0
- pretrained_models/moe-tts/18/model.pth +3 -0
- pretrained_models/moe-tts/2/config.json +36 -0
- pretrained_models/moe-tts/2/cover.jpg +3 -0
- pretrained_models/moe-tts/2/model.pth +3 -0
- pretrained_models/moe-tts/3/config.json +36 -0
- pretrained_models/moe-tts/3/cover.jpg +3 -0
- pretrained_models/moe-tts/3/model.pth +3 -0
- pretrained_models/moe-tts/4/config.json +36 -0
- pretrained_models/moe-tts/4/cover.jpg +3 -0
- pretrained_models/moe-tts/4/model.pth +3 -0
- pretrained_models/moe-tts/5/config.json +35 -0
- pretrained_models/moe-tts/5/cover.jpg +3 -0
- pretrained_models/moe-tts/5/model.pth +3 -0
- pretrained_models/moe-tts/6/config.json +35 -0
- pretrained_models/moe-tts/6/cover.jpg +3 -0
- pretrained_models/moe-tts/6/model.pth +3 -0
- pretrained_models/moe-tts/7/config.json +55 -0
- pretrained_models/moe-tts/7/cover.jpg +3 -0
- pretrained_models/moe-tts/7/model.pth +3 -0
- pretrained_models/moe-tts/8/config.json +35 -0
.gitattributes
CHANGED
@@ -23,7 +23,7 @@
|
|
23 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
|
27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
@@ -31,3 +31,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
31 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
23 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
pretrained_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
|
|
31 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
@@ -548,4 +548,4 @@ cython_debug/
|
|
548 |
#.idea/
|
549 |
|
550 |
# models
|
551 |
-
|
|
|
548 |
#.idea/
|
549 |
|
550 |
# models
|
551 |
+
#/pretrained_models/*
|
pretrained_models/moe-tts/0/config.json
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"segment_size": 8192
|
4 |
+
},
|
5 |
+
"data": {
|
6 |
+
"text_cleaners": [
|
7 |
+
"japanese_cleaners"
|
8 |
+
],
|
9 |
+
"max_wav_value": 32768.0,
|
10 |
+
"sampling_rate": 22050,
|
11 |
+
"filter_length": 1024,
|
12 |
+
"hop_length": 256,
|
13 |
+
"win_length": 1024,
|
14 |
+
"add_blank": true,
|
15 |
+
"n_speakers": 7
|
16 |
+
},
|
17 |
+
"model": {
|
18 |
+
"inter_channels": 192,
|
19 |
+
"hidden_channels": 192,
|
20 |
+
"filter_channels": 768,
|
21 |
+
"n_heads": 2,
|
22 |
+
"n_layers": 6,
|
23 |
+
"kernel_size": 3,
|
24 |
+
"p_dropout": 0.1,
|
25 |
+
"resblock": "1",
|
26 |
+
"resblock_kernel_sizes": [
|
27 |
+
3,
|
28 |
+
7,
|
29 |
+
11
|
30 |
+
],
|
31 |
+
"resblock_dilation_sizes": [
|
32 |
+
[
|
33 |
+
1,
|
34 |
+
3,
|
35 |
+
5
|
36 |
+
],
|
37 |
+
[
|
38 |
+
1,
|
39 |
+
3,
|
40 |
+
5
|
41 |
+
],
|
42 |
+
[
|
43 |
+
1,
|
44 |
+
3,
|
45 |
+
5
|
46 |
+
]
|
47 |
+
],
|
48 |
+
"upsample_rates": [
|
49 |
+
8,
|
50 |
+
8,
|
51 |
+
2,
|
52 |
+
2
|
53 |
+
],
|
54 |
+
"upsample_initial_channel": 512,
|
55 |
+
"upsample_kernel_sizes": [
|
56 |
+
16,
|
57 |
+
16,
|
58 |
+
4,
|
59 |
+
4
|
60 |
+
],
|
61 |
+
"n_layers_q": 3,
|
62 |
+
"use_spectral_norm": false,
|
63 |
+
"gin_channels": 256
|
64 |
+
},
|
65 |
+
"speakers": [
|
66 |
+
"\u7dbe\u5730\u5be7\u3005",
|
67 |
+
"\u56e0\u5e61\u3081\u3050\u308b",
|
68 |
+
"\u671d\u6b66\u82b3\u4e43",
|
69 |
+
"\u5e38\u9678\u8309\u5b50",
|
70 |
+
"\u30e0\u30e9\u30b5\u30e1",
|
71 |
+
"\u978d\u99ac\u5c0f\u6625",
|
72 |
+
"\u5728\u539f\u4e03\u6d77"
|
73 |
+
],
|
74 |
+
"symbols": [
|
75 |
+
"_",
|
76 |
+
",",
|
77 |
+
".",
|
78 |
+
"!",
|
79 |
+
"?",
|
80 |
+
"-",
|
81 |
+
"A",
|
82 |
+
"E",
|
83 |
+
"I",
|
84 |
+
"N",
|
85 |
+
"O",
|
86 |
+
"Q",
|
87 |
+
"U",
|
88 |
+
"a",
|
89 |
+
"b",
|
90 |
+
"d",
|
91 |
+
"e",
|
92 |
+
"f",
|
93 |
+
"g",
|
94 |
+
"h",
|
95 |
+
"i",
|
96 |
+
"j",
|
97 |
+
"k",
|
98 |
+
"m",
|
99 |
+
"n",
|
100 |
+
"o",
|
101 |
+
"p",
|
102 |
+
"r",
|
103 |
+
"s",
|
104 |
+
"t",
|
105 |
+
"u",
|
106 |
+
"v",
|
107 |
+
"w",
|
108 |
+
"y",
|
109 |
+
"z",
|
110 |
+
"\u0283",
|
111 |
+
"\u02a7",
|
112 |
+
"\u2193",
|
113 |
+
"\u2191",
|
114 |
+
" "
|
115 |
+
]
|
116 |
+
}
|
pretrained_models/moe-tts/0/cover.jpg
ADDED
Git LFS Details
|
pretrained_models/moe-tts/0/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:17a70ab64709e25401441bc54b01bfe10370f2f7f7916a243c86fa87a6cdb9f5
|
3 |
+
size 476620221
|
pretrained_models/moe-tts/1/config.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"segment_size": 8192
|
4 |
+
},
|
5 |
+
"data": {
|
6 |
+
"text_cleaners":["japanese_cleaners2"],
|
7 |
+
"max_wav_value": 32768.0,
|
8 |
+
"sampling_rate": 22050,
|
9 |
+
"filter_length": 1024,
|
10 |
+
"hop_length": 256,
|
11 |
+
"win_length": 1024,
|
12 |
+
"add_blank": true,
|
13 |
+
"n_speakers": 8
|
14 |
+
},
|
15 |
+
"model": {
|
16 |
+
"inter_channels": 192,
|
17 |
+
"hidden_channels": 192,
|
18 |
+
"filter_channels": 768,
|
19 |
+
"n_heads": 2,
|
20 |
+
"n_layers": 6,
|
21 |
+
"kernel_size": 3,
|
22 |
+
"p_dropout": 0.1,
|
23 |
+
"resblock": "1",
|
24 |
+
"resblock_kernel_sizes": [3,7,11],
|
25 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
26 |
+
"upsample_rates": [8,8,2,2],
|
27 |
+
"upsample_initial_channel": 512,
|
28 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
29 |
+
"n_layers_q": 3,
|
30 |
+
"use_spectral_norm": false,
|
31 |
+
"gin_channels": 256
|
32 |
+
},
|
33 |
+
"speakers": ["\u548c\u6cc9\u5983\u611b", "\u5e38\u76e4\u83ef\u4e43", "\u9326\u3042\u3059\u307f", "\u938c\u5009\u8a69\u685c", "\u7adc\u9591\u5929\u68a8", "\u548c\u6cc9\u91cc", "\u65b0\u5ddd\u5e83\u5922", "\u8056\u8389\u3005\u5b50"],
|
34 |
+
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u2193", "\u2191", " "]
|
35 |
+
}
|
pretrained_models/moe-tts/1/cover.jpg
ADDED
Git LFS Details
|
pretrained_models/moe-tts/1/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:73af1a9812c8edb038bad97b30feddb34a6e3834e1a86181873e02dd916b7f81
|
3 |
+
size 158884173
|
pretrained_models/moe-tts/10/config.json
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"log_interval": 200,
|
4 |
+
"eval_interval": 1000,
|
5 |
+
"seed": 1234,
|
6 |
+
"epochs": 10000,
|
7 |
+
"learning_rate": 2e-4,
|
8 |
+
"betas": [0.8, 0.99],
|
9 |
+
"eps": 1e-9,
|
10 |
+
"batch_size": 16,
|
11 |
+
"fp16_run": true,
|
12 |
+
"lr_decay": 0.999875,
|
13 |
+
"segment_size": 8192,
|
14 |
+
"init_lr_ratio": 1,
|
15 |
+
"warmup_epochs": 0,
|
16 |
+
"c_mel": 45,
|
17 |
+
"c_kl": 1.0
|
18 |
+
},
|
19 |
+
"data": {
|
20 |
+
"text_cleaners":[],
|
21 |
+
"max_wav_value": 32768.0,
|
22 |
+
"sampling_rate": 22050,
|
23 |
+
"filter_length": 1024,
|
24 |
+
"hop_length": 256,
|
25 |
+
"win_length": 1024,
|
26 |
+
"n_mel_channels": 80,
|
27 |
+
"mel_fmin": 0.0,
|
28 |
+
"mel_fmax": null,
|
29 |
+
"add_blank": true,
|
30 |
+
"n_speakers": 4
|
31 |
+
},
|
32 |
+
"model": {
|
33 |
+
"inter_channels": 192,
|
34 |
+
"hidden_channels": 256,
|
35 |
+
"filter_channels": 768,
|
36 |
+
"n_heads": 2,
|
37 |
+
"n_layers": 6,
|
38 |
+
"kernel_size": 3,
|
39 |
+
"p_dropout": 0.1,
|
40 |
+
"resblock": "1",
|
41 |
+
"resblock_kernel_sizes": [3,7,11],
|
42 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
43 |
+
"upsample_rates": [8,8,2,2],
|
44 |
+
"upsample_initial_channel": 512,
|
45 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
46 |
+
"n_layers_q": 3,
|
47 |
+
"use_spectral_norm": false,
|
48 |
+
"gin_channels": 256
|
49 |
+
},
|
50 |
+
"speakers": ["一方通行","上条当麻","御坂美琴","白井黒子"],
|
51 |
+
"symbols":[]
|
52 |
+
}
|
pretrained_models/moe-tts/10/cover.jpg
ADDED
Git LFS Details
|
pretrained_models/moe-tts/10/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2d7d3dc42ad38c3479b41c1060c442ba33018069be637e664fefafb4bb4ad764
|
3 |
+
size 220972879
|
pretrained_models/moe-tts/11/config.json
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"log_interval": 200,
|
4 |
+
"eval_interval": 1000,
|
5 |
+
"seed": 1234,
|
6 |
+
"epochs": 20000,
|
7 |
+
"learning_rate": 2e-4,
|
8 |
+
"betas": [0.8, 0.99],
|
9 |
+
"eps": 1e-9,
|
10 |
+
"batch_size": 16,
|
11 |
+
"fp16_run": true,
|
12 |
+
"lr_decay": 0.999875,
|
13 |
+
"segment_size": 8192,
|
14 |
+
"init_lr_ratio": 1,
|
15 |
+
"warmup_epochs": 0,
|
16 |
+
"c_mel": 45,
|
17 |
+
"c_kl": 1.0
|
18 |
+
},
|
19 |
+
"data": {
|
20 |
+
"text_cleaners":[],
|
21 |
+
"max_wav_value": 32768.0,
|
22 |
+
"sampling_rate": 22050,
|
23 |
+
"filter_length": 1024,
|
24 |
+
"hop_length": 256,
|
25 |
+
"win_length": 1024,
|
26 |
+
"n_mel_channels": 80,
|
27 |
+
"mel_fmin": 0.0,
|
28 |
+
"mel_fmax": null,
|
29 |
+
"add_blank": true,
|
30 |
+
"n_speakers": 1,
|
31 |
+
"cleaned_text": true
|
32 |
+
},
|
33 |
+
"model": {
|
34 |
+
"inter_channels": 192,
|
35 |
+
"hidden_channels": 256,
|
36 |
+
"filter_channels": 768,
|
37 |
+
"n_heads": 2,
|
38 |
+
"n_layers": 6,
|
39 |
+
"kernel_size": 3,
|
40 |
+
"p_dropout": 0.1,
|
41 |
+
"resblock": "1",
|
42 |
+
"resblock_kernel_sizes": [3,7,11],
|
43 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
44 |
+
"upsample_rates": [8,8,2,2],
|
45 |
+
"upsample_initial_channel": 512,
|
46 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
47 |
+
"n_layers_q": 3,
|
48 |
+
"use_spectral_norm": false
|
49 |
+
},
|
50 |
+
"speakers": ["εε£γγγ‘"],
|
51 |
+
"symbols":[]
|
52 |
+
}
|
pretrained_models/moe-tts/11/cover.jpg
ADDED
Git LFS Details
|
pretrained_models/moe-tts/11/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:56d55e4672c5f335ebae30728529e5efb8a9c3975a9b63e6590454ef8769ae70
|
3 |
+
size 203264375
|
pretrained_models/moe-tts/12/config.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"segment_size": 8192
|
4 |
+
},
|
5 |
+
"data": {
|
6 |
+
"text_cleaners":["japanese_cleaners2"],
|
7 |
+
"max_wav_value": 32768.0,
|
8 |
+
"sampling_rate": 22050,
|
9 |
+
"filter_length": 1024,
|
10 |
+
"hop_length": 256,
|
11 |
+
"win_length": 1024,
|
12 |
+
"add_blank": true,
|
13 |
+
"n_speakers": 12
|
14 |
+
},
|
15 |
+
"model": {
|
16 |
+
"inter_channels": 192,
|
17 |
+
"hidden_channels": 192,
|
18 |
+
"filter_channels": 768,
|
19 |
+
"n_heads": 2,
|
20 |
+
"n_layers": 6,
|
21 |
+
"kernel_size": 3,
|
22 |
+
"p_dropout": 0.1,
|
23 |
+
"resblock": "1",
|
24 |
+
"resblock_kernel_sizes": [3,7,11],
|
25 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
26 |
+
"upsample_rates": [8,8,2,2],
|
27 |
+
"upsample_initial_channel": 512,
|
28 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
29 |
+
"n_layers_q": 3,
|
30 |
+
"use_spectral_norm": false,
|
31 |
+
"gin_channels": 256
|
32 |
+
},
|
33 |
+
"speakers": ["\u77e2\u6765\u7f8e\u7fbd", "\u5e03\u826f\u6893", "\u30a8\u30ea\u30ca", "\u7a32\u6751\u8389\u97f3", "\u30cb\u30b3\u30e9", "\u8352\u795e\u5c0f\u591c", "\u5927\u623f\u3072\u3088\u91cc", "\u6de1\u8def\u840c\u9999", "\u30a2\u30f3\u30ca", "\u5009\u7aef\u76f4\u592a", "\u67a1\u5f62\u5175\u99ac", "\u6247\u5143\u6a39"],
|
34 |
+
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u2193", "\u2191", " "]
|
35 |
+
}
|
pretrained_models/moe-tts/12/cover.jpg
ADDED
Git LFS Details
|
pretrained_models/moe-tts/12/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cf8761f1f7818c961651d2c0d914821f742a9a1df8841aae376c888289ae5609
|
3 |
+
size 158888269
|
pretrained_models/moe-tts/13/config.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"segment_size": 8192
|
4 |
+
},
|
5 |
+
"data": {
|
6 |
+
"text_cleaners":["japanese_cleaners2"],
|
7 |
+
"max_wav_value": 32768.0,
|
8 |
+
"sampling_rate": 22050,
|
9 |
+
"filter_length": 1024,
|
10 |
+
"hop_length": 256,
|
11 |
+
"win_length": 1024,
|
12 |
+
"add_blank": true,
|
13 |
+
"n_speakers": 29
|
14 |
+
},
|
15 |
+
"model": {
|
16 |
+
"inter_channels": 192,
|
17 |
+
"hidden_channels": 192,
|
18 |
+
"filter_channels": 768,
|
19 |
+
"n_heads": 2,
|
20 |
+
"n_layers": 6,
|
21 |
+
"kernel_size": 3,
|
22 |
+
"p_dropout": 0.1,
|
23 |
+
"resblock": "1",
|
24 |
+
"resblock_kernel_sizes": [3,7,11],
|
25 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
26 |
+
"upsample_rates": [8,8,2,2],
|
27 |
+
"upsample_initial_channel": 512,
|
28 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
29 |
+
"n_layers_q": 3,
|
30 |
+
"use_spectral_norm": false,
|
31 |
+
"gin_channels": 256
|
32 |
+
},
|
33 |
+
"speakers": ["\u91d1\u8272\u306e\u95c7", "\u30e2\u30e2", "\u30ca\u30ca", "\u7d50\u57ce\u7f8e\u67d1", "\u53e4\u624b\u5ddd\u552f", "\u9ed2\u54b2\u82bd\u4e9c", "\u30cd\u30e1\u30b7\u30b9", "\u6751\u96e8\u9759", "\u30bb\u30ea\u30fc\u30cc", "\u30e9\u30e9", "\u5929\u6761\u9662\u6c99\u59eb", "\u897f\u9023\u5bfa\u6625\u83dc", "\u30eb\u30f3", "\u30e1\u30a4", "\u9727\u5d0e\u606d\u5b50", "\u7c7e\u5ca1\u91cc\u7d17", "\u6ca2\u7530\u672a\u592e", "\u30c6\u30a3\u30a2\u30fc\u30e6", "\u4e5d\u6761\u51db", "\u85e4\u5d0e\u7dbe", "\u7d50\u57ce\u83ef", "\u5fa1\u9580\u6dbc\u5b50", "\u30a2\u30bc\u30f3\u30c0", "\u5915\u5d0e\u68a8\u5b50", "\u7d50\u57ce\u68a8\u6597", "\u30da\u30b1", "\u733f\u5c71\u30b1\u30f3\u30a4\u30c1", "\u30ec\u30f3", "\u6821\u9577"],
|
34 |
+
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "#", "@", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u2193", "\u2191", " "]
|
35 |
+
}
|
pretrained_models/moe-tts/13/cover.jpg
ADDED
Git LFS Details
|
pretrained_models/moe-tts/13/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0e07fd627c9ad01002c889ddda9b8a9b0da9ab942115b50d44227ded7ca87ad4
|
3 |
+
size 158907213
|
pretrained_models/moe-tts/14/config.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"segment_size": 8192
|
4 |
+
},
|
5 |
+
"data": {
|
6 |
+
"text_cleaners":["cjks_cleaners"],
|
7 |
+
"max_wav_value": 32768.0,
|
8 |
+
"sampling_rate": 22050,
|
9 |
+
"filter_length": 1024,
|
10 |
+
"hop_length": 256,
|
11 |
+
"win_length": 1024,
|
12 |
+
"add_blank": true,
|
13 |
+
"n_speakers": 24
|
14 |
+
},
|
15 |
+
"model": {
|
16 |
+
"inter_channels": 192,
|
17 |
+
"hidden_channels": 192,
|
18 |
+
"filter_channels": 768,
|
19 |
+
"n_heads": 2,
|
20 |
+
"n_layers": 6,
|
21 |
+
"kernel_size": 3,
|
22 |
+
"p_dropout": 0.1,
|
23 |
+
"resblock": "1",
|
24 |
+
"resblock_kernel_sizes": [3,7,11],
|
25 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
26 |
+
"upsample_rates": [8,8,2,2],
|
27 |
+
"upsample_initial_channel": 512,
|
28 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
29 |
+
"n_layers_q": 3,
|
30 |
+
"use_spectral_norm": false,
|
31 |
+
"gin_channels": 256
|
32 |
+
},
|
33 |
+
"speakers": ["\u7dbe\u5730\u5be7\u3005", "\u671d\u6b66\u82b3\u4e43", "\u5728\u539f\u4e03\u6d77", "\u30eb\u30a4\u30ba", "\u91d1\u8272\u306e\u95c7", "\u30e2\u30e2", "\u7d50\u57ce\u7f8e\u67d1", "\u5c0f\u8338", "\u5510\u4e50\u541f", "\u5c0f\u6bb7", "\u82b1\u73b2", "\u516b\u56db", "\uc218\uc544", "\ubbf8\ubbf8\ub974", "\uc544\ub9b0", "\uc720\ud654", "\uc5f0\ud654", "SA1", "SA2", "SA3", "SA4", "SA5", "SA6"],
|
34 |
+
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "N", "Q", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "s", "t", "u", "v", "w", "x", "y", "z", "\u0283", "\u02a7", "\u02a5", "\u02a6", "\u026f", "\u0279", "\u0259", "\u0265", "\u00e7", "\u0278", "\u027e", "\u03b2", "\u014b", "\u0266", "\u02d0", "\u207c", "\u02b0", "`", "^", "#", "*", "=", "\u2192", "\u2193", "\u2191", " "]
|
35 |
+
}
|
pretrained_models/moe-tts/14/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2801051beb8f90bd9785604fad617bf95a8f05df93722ad8993128dd6bf91301
|
3 |
+
size 158912845
|
pretrained_models/moe-tts/15/config.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pretrained_models/moe-tts/15/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f021227e3d2e282ec5756e9704dcb2a28831c3b9ae527d639a2ca9b493e0636
|
3 |
+
size 161855565
|
pretrained_models/moe-tts/16/config.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"segment_size": 8192
|
4 |
+
},
|
5 |
+
"data": {
|
6 |
+
"text_cleaners":["shanghainese_cleaners"],
|
7 |
+
"max_wav_value": 32768.0,
|
8 |
+
"sampling_rate": 22050,
|
9 |
+
"filter_length": 1024,
|
10 |
+
"hop_length": 256,
|
11 |
+
"win_length": 1024,
|
12 |
+
"add_blank": true,
|
13 |
+
"n_speakers": 2
|
14 |
+
},
|
15 |
+
"model": {
|
16 |
+
"inter_channels": 192,
|
17 |
+
"hidden_channels": 192,
|
18 |
+
"filter_channels": 768,
|
19 |
+
"n_heads": 2,
|
20 |
+
"n_layers": 6,
|
21 |
+
"kernel_size": 3,
|
22 |
+
"p_dropout": 0.1,
|
23 |
+
"resblock": "1",
|
24 |
+
"resblock_kernel_sizes": [3,7,11],
|
25 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
26 |
+
"upsample_rates": [8,8,2,2],
|
27 |
+
"upsample_initial_channel": 512,
|
28 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
29 |
+
"n_layers_q": 3,
|
30 |
+
"use_spectral_norm": false,
|
31 |
+
"gin_channels": 256
|
32 |
+
},
|
33 |
+
"speakers": ["\u4e0a\u6d77\u8bdd","None"],
|
34 |
+
"symbols": ["_", ",", ".", "!", "?", "\u2026", "a", "b", "d", "f", "g", "h", "i", "k", "l", "m", "n", "o", "p", "s", "t", "u", "v", "y", "z", "\u00f8", "\u014b", "\u0235", "\u0251", "\u0254", "\u0255", "\u0259", "\u0264", "\u0266", "\u026a", "\u027f", "\u0291", "\u0294", "\u02b0", "\u0303", "\u0329", "\u1d00", "\u1d07", "1", "5", "6", "7", "8", " "]
|
35 |
+
}
|
pretrained_models/moe-tts/16/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:750299355c3cd6bec4bca61ac50dbfb4c1e129be9b0806442cee24071bed657b
|
3 |
+
size 158882637
|
pretrained_models/moe-tts/17/config.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"segment_size": 8192
|
4 |
+
},
|
5 |
+
"data": {
|
6 |
+
"text_cleaners":["chinese_dialect_cleaners"],
|
7 |
+
"max_wav_value": 32768.0,
|
8 |
+
"sampling_rate": 22050,
|
9 |
+
"filter_length": 1024,
|
10 |
+
"hop_length": 256,
|
11 |
+
"win_length": 1024,
|
12 |
+
"add_blank": true,
|
13 |
+
"n_speakers": 50
|
14 |
+
},
|
15 |
+
"model": {
|
16 |
+
"inter_channels": 192,
|
17 |
+
"hidden_channels": 192,
|
18 |
+
"filter_channels": 768,
|
19 |
+
"n_heads": 2,
|
20 |
+
"n_layers": 6,
|
21 |
+
"kernel_size": 3,
|
22 |
+
"p_dropout": 0.1,
|
23 |
+
"resblock": "1",
|
24 |
+
"resblock_kernel_sizes": [3,7,11],
|
25 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
26 |
+
"upsample_rates": [8,8,2,2],
|
27 |
+
"upsample_initial_channel": 512,
|
28 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
29 |
+
"n_layers_q": 3,
|
30 |
+
"use_spectral_norm": false,
|
31 |
+
"gin_channels": 256
|
32 |
+
},
|
33 |
+
"speakers": ["\u5e03\u826f\u6893", "\u7dbe\u5730\u5be7\u3005", "\u671d\u6b66\u82b3\u4e43", "\u5728\u539f\u4e03\u6d77", "\u30e6\u30fc\u30b9\u30c6\u30a3\u30a2", "\u30b3\u30ec\u30c3\u30c8", "\u30ea\u30b7\u30a2", "\u30ab\u30a4\u30e0", "\u30eb\u30a4\u30ba", "\u3064\u304f\u3088\u307f\u3061\u3083\u3093", "\u83f2\u5442\u83c8", "\u8b1d\u5b50\u81e3", "\u96ea\u898b", "\u590f\u828a\u5e06", "\u7f85\u5c11\u5cf0", "\u8b1d\u5b50\u7487", "\u6960\u5e0c\u59d0", "\u8389\u8389", "\u5c0f\u8338", "\u5510\u4e50\u541f", "\u5c0f\u6bb7", "\u82b1\u73b2", "\u6d77\u8bcd\u4e0a\u6d77\u8bdd", "\u6d77\u8bcd\u5e7f\u4e1c\u8bdd"],
|
34 |
+
"symbols": ["_", ",", ".", "!", "?", "~", "\u2026", "\u2500", "#", "N", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u00e6", "\u00e7", "\u00f8", "\u014b", "\u0153", "\u0235", "\u0250", "\u0251", "\u0252", "\u0253", "\u0254", "\u0255", "\u0257", "\u0258", "\u0259", "\u025a", "\u025b", "\u025c", "\u0263", "\u0264", "\u0266", "\u026a", "\u026d", "\u026f", "\u0275", "\u0277", "\u0278", "\u027b", "\u027e", "\u027f", "\u0282", "\u0285", "\u028a", "\u028b", "\u028c", "\u028f", "\u0291", "\u0294", "\u02a6", "\u02ae", "\u02b0", "\u02b7", "\u02c0", "\u02d0", "\u02e5", "\u02e6", "\u02e7", "\u02e8", "\u02e9", "\u0303", "\u031a", "\u0325", "\u0329", "\u1d00", "\u1d07", "\u2191", "\u2193", "\u2205", "\u2c7c", " "]
|
35 |
+
}
|
pretrained_models/moe-tts/17/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5bafc0ad64442808ccbdc1c880846d4d7ed30e5db6b9c68982bade0070e135a9
|
3 |
+
size 158966349
|
pretrained_models/moe-tts/18/config.json
ADDED
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"log_interval": 200,
|
4 |
+
"eval_interval": 1000,
|
5 |
+
"seed": 1234,
|
6 |
+
"epochs": 10000,
|
7 |
+
"learning_rate": 2e-4,
|
8 |
+
"betas": [0.8, 0.99],
|
9 |
+
"eps": 1e-9,
|
10 |
+
"batch_size": 1,
|
11 |
+
"fp16_run": true,
|
12 |
+
"lr_decay": 0.999875,
|
13 |
+
"segment_size": 8192,
|
14 |
+
"init_lr_ratio": 1,
|
15 |
+
"warmup_epochs": 0,
|
16 |
+
"c_mel": 45,
|
17 |
+
"c_kl": 1.0
|
18 |
+
},
|
19 |
+
"data": {
|
20 |
+
"training_files":"E:/uma_voice/output_train.txt.cleaned",
|
21 |
+
"validation_files":"E:/uma_voice/output_val.txt.cleaned",
|
22 |
+
"text_cleaners":["japanese_cleaners"],
|
23 |
+
"max_wav_value": 32768.0,
|
24 |
+
"sampling_rate": 22050,
|
25 |
+
"filter_length": 1024,
|
26 |
+
"hop_length": 256,
|
27 |
+
"win_length": 1024,
|
28 |
+
"n_mel_channels": 80,
|
29 |
+
"mel_fmin": 0.0,
|
30 |
+
"mel_fmax": null,
|
31 |
+
"add_blank": true,
|
32 |
+
"n_speakers": 87,
|
33 |
+
"cleaned_text": true
|
34 |
+
},
|
35 |
+
"model": {
|
36 |
+
"inter_channels": 192,
|
37 |
+
"hidden_channels": 192,
|
38 |
+
"filter_channels": 768,
|
39 |
+
"n_heads": 2,
|
40 |
+
"n_layers": 6,
|
41 |
+
"kernel_size": 3,
|
42 |
+
"p_dropout": 0.1,
|
43 |
+
"resblock": "1",
|
44 |
+
"resblock_kernel_sizes": [3,7,11],
|
45 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
46 |
+
"upsample_rates": [8,8,2,2],
|
47 |
+
"upsample_initial_channel": 512,
|
48 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
49 |
+
"n_layers_q": 3,
|
50 |
+
"use_spectral_norm": false,
|
51 |
+
"gin_channels": 256
|
52 |
+
},
|
53 |
+
"speakers": ["Special Week",
|
54 |
+
"Silence Suzuka",
|
55 |
+
"Tokai Teio",
|
56 |
+
"Maruzensky",
|
57 |
+
"Fuji Kiseki",
|
58 |
+
"Oguri Cap",
|
59 |
+
"Gold Ship",
|
60 |
+
"Vodka",
|
61 |
+
"Daiwa Scarlet",
|
62 |
+
"Taiki Shuttle",
|
63 |
+
"Grass Wonder",
|
64 |
+
"Hishi Amazon",
|
65 |
+
"Mejiro Mcqueen",
|
66 |
+
"El Condor Pasa",
|
67 |
+
"T.M. Opera O",
|
68 |
+
"Narita Brian",
|
69 |
+
"Symboli Rudolf",
|
70 |
+
"Air Groove",
|
71 |
+
"Agnes Digital",
|
72 |
+
"Seiun Sky",
|
73 |
+
"Tamamo Cross",
|
74 |
+
"Fine Motion",
|
75 |
+
"Biwa Hayahide",
|
76 |
+
"Mayano Topgun",
|
77 |
+
"Manhattan Cafe",
|
78 |
+
"Mihono Bourbon",
|
79 |
+
"Mejiro Ryan",
|
80 |
+
"Hishi Akebono",
|
81 |
+
"Yukino Bijin",
|
82 |
+
"Rice Shower",
|
83 |
+
"Ines Fujin",
|
84 |
+
"Agnes Tachyon",
|
85 |
+
"Admire Vega",
|
86 |
+
"Inari One",
|
87 |
+
"Winning Ticket",
|
88 |
+
"Air Shakur",
|
89 |
+
"Eishin Flash",
|
90 |
+
"Curren Chan",
|
91 |
+
"Kawakami Princess",
|
92 |
+
"Gold City",
|
93 |
+
"Sakura Bakushin O",
|
94 |
+
"Seeking the Pearl",
|
95 |
+
"Shinko Windy",
|
96 |
+
"Sweep Tosho",
|
97 |
+
"Super Creek",
|
98 |
+
"Smart Falcon",
|
99 |
+
"Zenno Rob Roy",
|
100 |
+
"Tosen Jordan",
|
101 |
+
"Nakayama Festa",
|
102 |
+
"Narita Taishin",
|
103 |
+
"Nishino Flower",
|
104 |
+
"Haru Urara",
|
105 |
+
"Bamboo Memory",
|
106 |
+
"Biko Pegasus",
|
107 |
+
"Marvelous Sunday",
|
108 |
+
"Matikane Fukukitaru",
|
109 |
+
"Mr. C.B.",
|
110 |
+
"Meisho Doto",
|
111 |
+
"Mejiro Dober",
|
112 |
+
"Nice Nature",
|
113 |
+
"King Halo",
|
114 |
+
"Matikane Tannhauser",
|
115 |
+
"Ikuno Dictus",
|
116 |
+
"Mejiro Palmer",
|
117 |
+
"Daitaku Helios",
|
118 |
+
"Twin Turbo",
|
119 |
+
"Satono Diamond",
|
120 |
+
"Kitasan Black",
|
121 |
+
"Sakura Chiyono O",
|
122 |
+
"Sirius Symboli",
|
123 |
+
"Mejiro Ardan",
|
124 |
+
"Yaeno Muteki",
|
125 |
+
"Tsurumaru Tsuyoshi",
|
126 |
+
"Mejiro Bright",
|
127 |
+
"Sakura Laurel",
|
128 |
+
"Narita Top Road",
|
129 |
+
"Yamanin Zephyr",
|
130 |
+
"Symboli Kris S",
|
131 |
+
"Tanino Gimlet",
|
132 |
+
"Daiichi Ruby",
|
133 |
+
"Aston Machan",
|
134 |
+
"Hayakawa Tazuna",
|
135 |
+
"KS Miracle",
|
136 |
+
"Kopano Rickey",
|
137 |
+
"Hoko Tarumae",
|
138 |
+
"Wonder Acute",
|
139 |
+
"President Akikawa"
|
140 |
+
],
|
141 |
+
"symbols": ["_", ",", ".", "!", "?", "-", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
|
142 |
+
}
|
pretrained_models/moe-tts/18/cover.jpg
ADDED
Git LFS Details
|
pretrained_models/moe-tts/18/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4a53f4eb6bf8226b3fb4a3b31436235f697692f5566039ce3491b80af9a9567a
|
3 |
+
size 158962765
|
pretrained_models/moe-tts/2/config.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"segment_size": 8192
|
4 |
+
},
|
5 |
+
"data": {
|
6 |
+
"text_cleaners":["japanese_cleaners"],
|
7 |
+
"max_wav_value": 32768.0,
|
8 |
+
"sampling_rate": 22050,
|
9 |
+
"filter_length": 1024,
|
10 |
+
"hop_length": 256,
|
11 |
+
"win_length": 1024,
|
12 |
+
"add_blank": true,
|
13 |
+
"n_speakers": 7
|
14 |
+
},
|
15 |
+
"model": {
|
16 |
+
"inter_channels": 192,
|
17 |
+
"hidden_channels": 192,
|
18 |
+
"filter_channels": 768,
|
19 |
+
"n_heads": 2,
|
20 |
+
"n_layers": 6,
|
21 |
+
"kernel_size": 3,
|
22 |
+
"p_dropout": 0.1,
|
23 |
+
"resblock": "1",
|
24 |
+
"resblock_kernel_sizes": [3,7,11],
|
25 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
26 |
+
"upsample_rates": [8,8,2,2],
|
27 |
+
"upsample_initial_channel": 512,
|
28 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
29 |
+
"n_layers_q": 3,
|
30 |
+
"use_spectral_norm": false,
|
31 |
+
"gin_channels": 256
|
32 |
+
},
|
33 |
+
"speakers": ["\u56db\u5b63\u30ca\u30c4\u30e1", "\u660e\u6708\u681e\u90a3", "\u58a8\u67d3\u5e0c", "\u706b\u6253\u8c37\u611b\u8863", "\u6c50\u5c71\u6dbc\u97f3", "None", "None"],
|
34 |
+
"symbols": ["_", ",", ".", "!", "?", "-", "~","A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
|
35 |
+
}
|
36 |
+
|
pretrained_models/moe-tts/2/cover.jpg
ADDED
Git LFS Details
|
pretrained_models/moe-tts/2/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:16adcc6dd3f23ac4407176769f1e6843f86a5b16e04b8abb5a6a11132e6b9751
|
3 |
+
size 476622149
|
pretrained_models/moe-tts/3/config.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"segment_size": 8192
|
4 |
+
},
|
5 |
+
"data": {
|
6 |
+
"text_cleaners":["japanese_cleaners"],
|
7 |
+
"max_wav_value": 32768.0,
|
8 |
+
"sampling_rate": 22050,
|
9 |
+
"filter_length": 1024,
|
10 |
+
"hop_length": 256,
|
11 |
+
"win_length": 1024,
|
12 |
+
"add_blank": true,
|
13 |
+
"n_speakers": 5
|
14 |
+
},
|
15 |
+
"model": {
|
16 |
+
"inter_channels": 192,
|
17 |
+
"hidden_channels": 192,
|
18 |
+
"filter_channels": 768,
|
19 |
+
"n_heads": 2,
|
20 |
+
"n_layers": 6,
|
21 |
+
"kernel_size": 3,
|
22 |
+
"p_dropout": 0.1,
|
23 |
+
"resblock": "1",
|
24 |
+
"resblock_kernel_sizes": [3,7,11],
|
25 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
26 |
+
"upsample_rates": [8,8,2,2],
|
27 |
+
"upsample_initial_channel": 512,
|
28 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
29 |
+
"n_layers_q": 3,
|
30 |
+
"use_spectral_norm": false,
|
31 |
+
"gin_channels": 256
|
32 |
+
},
|
33 |
+
"speakers": ["\u6625\u65e5\u91ce\u7a79", "\u5929\u5973\u76ee\u745b", "\u4f9d\u5a9b\u5948\u7dd2", "\u6e1a\u4e00\u8449", "None"],
|
34 |
+
"symbols": ["_", ",", ".", "!", "?", "-", "~","A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
|
35 |
+
}
|
36 |
+
|
pretrained_models/moe-tts/3/cover.jpg
ADDED
Git LFS Details
|
pretrained_models/moe-tts/3/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:60dfd6e56a1f895e3db4c054fd94d5a0362103dd5d2e19941e17dd1be41e6b11
|
3 |
+
size 476796721
|
pretrained_models/moe-tts/4/config.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"segment_size": 8192
|
4 |
+
},
|
5 |
+
"data": {
|
6 |
+
"text_cleaners":["japanese_cleaners"],
|
7 |
+
"max_wav_value": 32768.0,
|
8 |
+
"sampling_rate": 22050,
|
9 |
+
"filter_length": 1024,
|
10 |
+
"hop_length": 256,
|
11 |
+
"win_length": 1024,
|
12 |
+
"add_blank": true,
|
13 |
+
"n_speakers": 6
|
14 |
+
},
|
15 |
+
"model": {
|
16 |
+
"inter_channels": 192,
|
17 |
+
"hidden_channels": 192,
|
18 |
+
"filter_channels": 768,
|
19 |
+
"n_heads": 2,
|
20 |
+
"n_layers": 6,
|
21 |
+
"kernel_size": 3,
|
22 |
+
"p_dropout": 0.1,
|
23 |
+
"resblock": "1",
|
24 |
+
"resblock_kernel_sizes": [3,7,11],
|
25 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
26 |
+
"upsample_rates": [8,8,2,2],
|
27 |
+
"upsample_initial_channel": 512,
|
28 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
29 |
+
"n_layers_q": 3,
|
30 |
+
"use_spectral_norm": false,
|
31 |
+
"gin_channels": 256
|
32 |
+
},
|
33 |
+
"speakers": ["\u84ee\u83ef", "\u7bdd\u30ce\u9727\u679d", "\u6ca2\u6e21\u96eb", "\u4e9c\u7483\u5b50", "\u706f\u9732\u690e", "\u89a1\u5915\u8389"],
|
34 |
+
"symbols": ["_", ",", ".", "!", "?", "-", "~","A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
|
35 |
+
}
|
36 |
+
|
pretrained_models/moe-tts/4/cover.jpg
ADDED
Git LFS Details
|
pretrained_models/moe-tts/4/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ae80b8e7f18766625a2fe991263c7c0d42364fa1a55d772c0c645f68c72a3750
|
3 |
+
size 476799793
|
pretrained_models/moe-tts/5/config.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"segment_size": 8192
|
4 |
+
},
|
5 |
+
"data": {
|
6 |
+
"text_cleaners":["zh_ja_mixture_cleaners"],
|
7 |
+
"max_wav_value": 32768.0,
|
8 |
+
"sampling_rate": 22050,
|
9 |
+
"filter_length": 1024,
|
10 |
+
"hop_length": 256,
|
11 |
+
"win_length": 1024,
|
12 |
+
"add_blank": true,
|
13 |
+
"n_speakers": 5
|
14 |
+
},
|
15 |
+
"model": {
|
16 |
+
"inter_channels": 192,
|
17 |
+
"hidden_channels": 192,
|
18 |
+
"filter_channels": 768,
|
19 |
+
"n_heads": 2,
|
20 |
+
"n_layers": 6,
|
21 |
+
"kernel_size": 3,
|
22 |
+
"p_dropout": 0.1,
|
23 |
+
"resblock": "1",
|
24 |
+
"resblock_kernel_sizes": [3,7,11],
|
25 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
26 |
+
"upsample_rates": [8,8,2,2],
|
27 |
+
"upsample_initial_channel": 512,
|
28 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
29 |
+
"n_layers_q": 3,
|
30 |
+
"use_spectral_norm": false,
|
31 |
+
"gin_channels": 256
|
32 |
+
},
|
33 |
+
"speakers": ["\u7dbe\u5730\u5be7\u3005", "\u5728\u539f\u4e03\u6d77", "\u5c0f\u8338", "\u5510\u4e50\u541f"],
|
34 |
+
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u026f", "\u0279", "\u0259", "\u0265", "\u207c", "\u02b0", "`", "\u2192", "\u2193", "\u2191", " "]
|
35 |
+
}
|
pretrained_models/moe-tts/5/cover.jpg
ADDED
Git LFS Details
|
pretrained_models/moe-tts/5/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:edfb6b428c398fab83a85b5ae41e13cb5a9f7be12692129e8a880d4553701f7b
|
3 |
+
size 158888013
|
pretrained_models/moe-tts/6/config.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"segment_size": 8192
|
4 |
+
},
|
5 |
+
"data": {
|
6 |
+
"text_cleaners":["korean_cleaners"],
|
7 |
+
"max_wav_value": 32768.0,
|
8 |
+
"sampling_rate": 22050,
|
9 |
+
"filter_length": 1024,
|
10 |
+
"hop_length": 256,
|
11 |
+
"win_length": 1024,
|
12 |
+
"add_blank": true,
|
13 |
+
"n_speakers": 6
|
14 |
+
},
|
15 |
+
"model": {
|
16 |
+
"inter_channels": 192,
|
17 |
+
"hidden_channels": 192,
|
18 |
+
"filter_channels": 768,
|
19 |
+
"n_heads": 2,
|
20 |
+
"n_layers": 6,
|
21 |
+
"kernel_size": 3,
|
22 |
+
"p_dropout": 0.1,
|
23 |
+
"resblock": "1",
|
24 |
+
"resblock_kernel_sizes": [3,7,11],
|
25 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
26 |
+
"upsample_rates": [8,8,2,2],
|
27 |
+
"upsample_initial_channel": 512,
|
28 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
29 |
+
"n_layers_q": 3,
|
30 |
+
"use_spectral_norm": false,
|
31 |
+
"gin_channels": 256
|
32 |
+
},
|
33 |
+
"speakers": ["\uc218\uc544", "\ubbf8\ubbf8\ub974", "\uc544\ub9b0", "\uc5f0\ud654", "\uc720\ud654", "\uc120\ubc30"],
|
34 |
+
"symbols": ["_", ",", ".", "!", "?", "\u2026", "~", "\u3131", "\u3134", "\u3137", "\u3139", "\u3141", "\u3142", "\u3145", "\u3147", "\u3148", "\u314a", "\u314b", "\u314c", "\u314d", "\u314e", "\u3132", "\u3138", "\u3143", "\u3146", "\u3149", "\u314f", "\u3153", "\u3157", "\u315c", "\u3161", "\u3163", "\u3150", "\u3154", " "]
|
35 |
+
}
|
pretrained_models/moe-tts/6/cover.jpg
ADDED
Git LFS Details
|
pretrained_models/moe-tts/6/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5461551d900d726e24fe5551c3773c0c27419c9237882fe7d400025344499f85
|
3 |
+
size 158875981
|
pretrained_models/moe-tts/7/config.json
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"log_interval": 10,
|
4 |
+
"eval_interval": 1000,
|
5 |
+
"seed": 1234,
|
6 |
+
"epochs": 10000,
|
7 |
+
"learning_rate": 2e-4,
|
8 |
+
"betas": [0.8, 0.99],
|
9 |
+
"eps": 1e-9,
|
10 |
+
"batch_size": 50,
|
11 |
+
"fp16_run": true,
|
12 |
+
"lr_decay": 0.999875,
|
13 |
+
"segment_size": 8192,
|
14 |
+
"init_lr_ratio": 1,
|
15 |
+
"warmup_epochs": 0,
|
16 |
+
"c_mel": 45,
|
17 |
+
"c_kl": 1.0
|
18 |
+
},
|
19 |
+
"data": {
|
20 |
+
"training_files":"filelists/multi_speaker.csv",
|
21 |
+
"validation_files":"filelists/multi_speaker_val.csv",
|
22 |
+
"text_cleaners":["japanese_cleaners"],
|
23 |
+
"max_wav_value": 32768.0,
|
24 |
+
"sampling_rate": 22050,
|
25 |
+
"filter_length": 1024,
|
26 |
+
"hop_length": 256,
|
27 |
+
"win_length": 1024,
|
28 |
+
"n_mel_channels": 80,
|
29 |
+
"mel_fmin": 0.0,
|
30 |
+
"mel_fmax": null,
|
31 |
+
"add_blank": true,
|
32 |
+
"n_speakers": 13,
|
33 |
+
"cleaned_text": true
|
34 |
+
},
|
35 |
+
"model": {
|
36 |
+
"inter_channels": 192,
|
37 |
+
"hidden_channels": 192,
|
38 |
+
"filter_channels": 768,
|
39 |
+
"n_heads": 2,
|
40 |
+
"n_layers": 6,
|
41 |
+
"kernel_size": 3,
|
42 |
+
"p_dropout": 0.1,
|
43 |
+
"resblock": "1",
|
44 |
+
"resblock_kernel_sizes": [3,7,11],
|
45 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
46 |
+
"upsample_rates": [8,8,2,2],
|
47 |
+
"upsample_initial_channel": 512,
|
48 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
49 |
+
"n_layers_q": 3,
|
50 |
+
"use_spectral_norm": false,
|
51 |
+
"gin_channels": 256
|
52 |
+
},
|
53 |
+
"speakers": ["ι·Ήεζη","ι·Ήεζι΄","γ’γγ€γͺγ’", "εη§ζζ₯ι¦","ATRI", "γ’γ€γ©", "ζ°ε 彩ι³", "姫ιζε₯", "ε°ι γγ", "θ代ζ©ζ°·ηΉ", "ζεηη½", "η½ε²ηΎη΅΅η ", "δΊιε ηη΄
"],
|
54 |
+
"symbols":["_", ",", ".", "!", "?", "-", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
|
55 |
+
}
|
pretrained_models/moe-tts/7/cover.jpg
ADDED
Git LFS Details
|
pretrained_models/moe-tts/7/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f96e046a777407883d4665777118bdfbe0a48fc18c5fdea16c1d05eaa3af7773
|
3 |
+
size 476818993
|
pretrained_models/moe-tts/8/config.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"segment_size": 8192
|
4 |
+
},
|
5 |
+
"data": {
|
6 |
+
"text_cleaners":["japanese_cleaners2"],
|
7 |
+
"max_wav_value": 32768.0,
|
8 |
+
"sampling_rate": 22050,
|
9 |
+
"filter_length": 1024,
|
10 |
+
"hop_length": 256,
|
11 |
+
"win_length": 1024,
|
12 |
+
"add_blank": true,
|
13 |
+
"n_speakers": 26
|
14 |
+
},
|
15 |
+
"model": {
|
16 |
+
"inter_channels": 192,
|
17 |
+
"hidden_channels": 192,
|
18 |
+
"filter_channels": 768,
|
19 |
+
"n_heads": 2,
|
20 |
+
"n_layers": 6,
|
21 |
+
"kernel_size": 3,
|
22 |
+
"p_dropout": 0.1,
|
23 |
+
"resblock": "1",
|
24 |
+
"resblock_kernel_sizes": [3,7,11],
|
25 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
26 |
+
"upsample_rates": [8,8,2,2],
|
27 |
+
"upsample_initial_channel": 512,
|
28 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
29 |
+
"n_layers_q": 3,
|
30 |
+
"use_spectral_norm": false,
|
31 |
+
"gin_channels": 256
|
32 |
+
},
|
33 |
+
"speakers": ["\u30eb\u30a4\u30ba", "\u30c6\u30a3\u30d5\u30a1\u30cb\u30a2", "\u30a4\u30eb\u30af\u30af\u30a5", "\u30a2\u30f3\u30ea\u30a8\u30c3\u30bf", "\u30bf\u30d0\u30b5", "\u30b7\u30a8\u30b9\u30bf", "\u30cf\u30eb\u30ca", "\u5c11\u5973\u30ea\u30b7\u30e5", "\u30ea\u30b7\u30e5", "\u30a2\u30ad\u30ca", "\u30af\u30ea\u30b9", "\u30ab\u30c8\u30ec\u30a2", "\u30a8\u30ec\u30aa\u30ce\u30fc\u30eb", "\u30e2\u30f3\u30e2\u30e9\u30f3\u30b7\u30fc", "\u30ea\u30fc\u30f4\u30eb", "\u30ad\u30e5\u30eb\u30b1", "\u30a6\u30a7\u30b6\u30ea\u30fc", "\u30b5\u30a4\u30c8", "\u30ae\u30fc\u30b7\u30e5", "\u30b3\u30eb\u30d9\u30fc\u30eb", "\u30aa\u30b9\u30de\u30f3", "\u30c7\u30eb\u30d5\u30ea\u30f3\u30ac\u30fc", "\u30c6\u30af\u30b9\u30c8", "\u30c0\u30f3\u30d7\u30ea\u30e1", "\u30ac\u30ec\u30c3\u30c8", "\u30b9\u30ab\u30ed\u30f3"],
|
34 |
+
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u2193", "\u2191", " "]
|
35 |
+
}
|