RyaoChengfeng commited on
Commit
79dd817
·
1 Parent(s): b5b5597
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -1
  2. .gitignore +1 -1
  3. pretrained_models/moe-tts/0/config.json +116 -0
  4. pretrained_models/moe-tts/0/cover.jpg +3 -0
  5. pretrained_models/moe-tts/0/model.pth +3 -0
  6. pretrained_models/moe-tts/1/config.json +35 -0
  7. pretrained_models/moe-tts/1/cover.jpg +3 -0
  8. pretrained_models/moe-tts/1/model.pth +3 -0
  9. pretrained_models/moe-tts/10/config.json +52 -0
  10. pretrained_models/moe-tts/10/cover.jpg +3 -0
  11. pretrained_models/moe-tts/10/model.pth +3 -0
  12. pretrained_models/moe-tts/11/config.json +52 -0
  13. pretrained_models/moe-tts/11/cover.jpg +3 -0
  14. pretrained_models/moe-tts/11/model.pth +3 -0
  15. pretrained_models/moe-tts/12/config.json +35 -0
  16. pretrained_models/moe-tts/12/cover.jpg +3 -0
  17. pretrained_models/moe-tts/12/model.pth +3 -0
  18. pretrained_models/moe-tts/13/config.json +35 -0
  19. pretrained_models/moe-tts/13/cover.jpg +3 -0
  20. pretrained_models/moe-tts/13/model.pth +3 -0
  21. pretrained_models/moe-tts/14/config.json +35 -0
  22. pretrained_models/moe-tts/14/model.pth +3 -0
  23. pretrained_models/moe-tts/15/config.json +0 -0
  24. pretrained_models/moe-tts/15/model.pth +3 -0
  25. pretrained_models/moe-tts/16/config.json +35 -0
  26. pretrained_models/moe-tts/16/model.pth +3 -0
  27. pretrained_models/moe-tts/17/config.json +35 -0
  28. pretrained_models/moe-tts/17/model.pth +3 -0
  29. pretrained_models/moe-tts/18/config.json +142 -0
  30. pretrained_models/moe-tts/18/cover.jpg +3 -0
  31. pretrained_models/moe-tts/18/model.pth +3 -0
  32. pretrained_models/moe-tts/2/config.json +36 -0
  33. pretrained_models/moe-tts/2/cover.jpg +3 -0
  34. pretrained_models/moe-tts/2/model.pth +3 -0
  35. pretrained_models/moe-tts/3/config.json +36 -0
  36. pretrained_models/moe-tts/3/cover.jpg +3 -0
  37. pretrained_models/moe-tts/3/model.pth +3 -0
  38. pretrained_models/moe-tts/4/config.json +36 -0
  39. pretrained_models/moe-tts/4/cover.jpg +3 -0
  40. pretrained_models/moe-tts/4/model.pth +3 -0
  41. pretrained_models/moe-tts/5/config.json +35 -0
  42. pretrained_models/moe-tts/5/cover.jpg +3 -0
  43. pretrained_models/moe-tts/5/model.pth +3 -0
  44. pretrained_models/moe-tts/6/config.json +35 -0
  45. pretrained_models/moe-tts/6/cover.jpg +3 -0
  46. pretrained_models/moe-tts/6/model.pth +3 -0
  47. pretrained_models/moe-tts/7/config.json +55 -0
  48. pretrained_models/moe-tts/7/cover.jpg +3 -0
  49. pretrained_models/moe-tts/7/model.pth +3 -0
  50. pretrained_models/moe-tts/8/config.json +35 -0
.gitattributes CHANGED
@@ -23,7 +23,7 @@
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
  *.tgz filter=lfs diff=lfs merge=lfs -text
@@ -31,3 +31,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
31
  *.xz filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
 
 
 
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ pretrained_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
  *.tgz filter=lfs diff=lfs merge=lfs -text
 
31
  *.xz filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *.png filter=lfs diff=lfs merge=lfs -text
35
+ *.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -548,4 +548,4 @@ cython_debug/
548
  #.idea/
549
 
550
  # models
551
- /pretrained_models/*
 
548
  #.idea/
549
 
550
  # models
551
+ #/pretrained_models/*
pretrained_models/moe-tts/0/config.json ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "segment_size": 8192
4
+ },
5
+ "data": {
6
+ "text_cleaners": [
7
+ "japanese_cleaners"
8
+ ],
9
+ "max_wav_value": 32768.0,
10
+ "sampling_rate": 22050,
11
+ "filter_length": 1024,
12
+ "hop_length": 256,
13
+ "win_length": 1024,
14
+ "add_blank": true,
15
+ "n_speakers": 7
16
+ },
17
+ "model": {
18
+ "inter_channels": 192,
19
+ "hidden_channels": 192,
20
+ "filter_channels": 768,
21
+ "n_heads": 2,
22
+ "n_layers": 6,
23
+ "kernel_size": 3,
24
+ "p_dropout": 0.1,
25
+ "resblock": "1",
26
+ "resblock_kernel_sizes": [
27
+ 3,
28
+ 7,
29
+ 11
30
+ ],
31
+ "resblock_dilation_sizes": [
32
+ [
33
+ 1,
34
+ 3,
35
+ 5
36
+ ],
37
+ [
38
+ 1,
39
+ 3,
40
+ 5
41
+ ],
42
+ [
43
+ 1,
44
+ 3,
45
+ 5
46
+ ]
47
+ ],
48
+ "upsample_rates": [
49
+ 8,
50
+ 8,
51
+ 2,
52
+ 2
53
+ ],
54
+ "upsample_initial_channel": 512,
55
+ "upsample_kernel_sizes": [
56
+ 16,
57
+ 16,
58
+ 4,
59
+ 4
60
+ ],
61
+ "n_layers_q": 3,
62
+ "use_spectral_norm": false,
63
+ "gin_channels": 256
64
+ },
65
+ "speakers": [
66
+ "\u7dbe\u5730\u5be7\u3005",
67
+ "\u56e0\u5e61\u3081\u3050\u308b",
68
+ "\u671d\u6b66\u82b3\u4e43",
69
+ "\u5e38\u9678\u8309\u5b50",
70
+ "\u30e0\u30e9\u30b5\u30e1",
71
+ "\u978d\u99ac\u5c0f\u6625",
72
+ "\u5728\u539f\u4e03\u6d77"
73
+ ],
74
+ "symbols": [
75
+ "_",
76
+ ",",
77
+ ".",
78
+ "!",
79
+ "?",
80
+ "-",
81
+ "A",
82
+ "E",
83
+ "I",
84
+ "N",
85
+ "O",
86
+ "Q",
87
+ "U",
88
+ "a",
89
+ "b",
90
+ "d",
91
+ "e",
92
+ "f",
93
+ "g",
94
+ "h",
95
+ "i",
96
+ "j",
97
+ "k",
98
+ "m",
99
+ "n",
100
+ "o",
101
+ "p",
102
+ "r",
103
+ "s",
104
+ "t",
105
+ "u",
106
+ "v",
107
+ "w",
108
+ "y",
109
+ "z",
110
+ "\u0283",
111
+ "\u02a7",
112
+ "\u2193",
113
+ "\u2191",
114
+ " "
115
+ ]
116
+ }
pretrained_models/moe-tts/0/cover.jpg ADDED

Git LFS Details

  • SHA256: 2d443da7d7eb5c5b054077ece85b68b2b94bf5db2b51001fe32404deea7f0717
  • Pointer size: 130 Bytes
  • Size of remote file: 39.9 kB
pretrained_models/moe-tts/0/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17a70ab64709e25401441bc54b01bfe10370f2f7f7916a243c86fa87a6cdb9f5
3
+ size 476620221
pretrained_models/moe-tts/1/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "segment_size": 8192
4
+ },
5
+ "data": {
6
+ "text_cleaners":["japanese_cleaners2"],
7
+ "max_wav_value": 32768.0,
8
+ "sampling_rate": 22050,
9
+ "filter_length": 1024,
10
+ "hop_length": 256,
11
+ "win_length": 1024,
12
+ "add_blank": true,
13
+ "n_speakers": 8
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "kernel_size": 3,
22
+ "p_dropout": 0.1,
23
+ "resblock": "1",
24
+ "resblock_kernel_sizes": [3,7,11],
25
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "upsample_rates": [8,8,2,2],
27
+ "upsample_initial_channel": 512,
28
+ "upsample_kernel_sizes": [16,16,4,4],
29
+ "n_layers_q": 3,
30
+ "use_spectral_norm": false,
31
+ "gin_channels": 256
32
+ },
33
+ "speakers": ["\u548c\u6cc9\u5983\u611b", "\u5e38\u76e4\u83ef\u4e43", "\u9326\u3042\u3059\u307f", "\u938c\u5009\u8a69\u685c", "\u7adc\u9591\u5929\u68a8", "\u548c\u6cc9\u91cc", "\u65b0\u5ddd\u5e83\u5922", "\u8056\u8389\u3005\u5b50"],
34
+ "symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u2193", "\u2191", " "]
35
+ }
pretrained_models/moe-tts/1/cover.jpg ADDED

Git LFS Details

  • SHA256: 0123d1fa78031a85890869891b843b2f079c66fed12cf510cb6025e2e4db04c3
  • Pointer size: 130 Bytes
  • Size of remote file: 50.3 kB
pretrained_models/moe-tts/1/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73af1a9812c8edb038bad97b30feddb34a6e3834e1a86181873e02dd916b7f81
3
+ size 158884173
pretrained_models/moe-tts/10/config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 2e-4,
8
+ "betas": [0.8, 0.99],
9
+ "eps": 1e-9,
10
+ "batch_size": 16,
11
+ "fp16_run": true,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8192,
14
+ "init_lr_ratio": 1,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0
18
+ },
19
+ "data": {
20
+ "text_cleaners":[],
21
+ "max_wav_value": 32768.0,
22
+ "sampling_rate": 22050,
23
+ "filter_length": 1024,
24
+ "hop_length": 256,
25
+ "win_length": 1024,
26
+ "n_mel_channels": 80,
27
+ "mel_fmin": 0.0,
28
+ "mel_fmax": null,
29
+ "add_blank": true,
30
+ "n_speakers": 4
31
+ },
32
+ "model": {
33
+ "inter_channels": 192,
34
+ "hidden_channels": 256,
35
+ "filter_channels": 768,
36
+ "n_heads": 2,
37
+ "n_layers": 6,
38
+ "kernel_size": 3,
39
+ "p_dropout": 0.1,
40
+ "resblock": "1",
41
+ "resblock_kernel_sizes": [3,7,11],
42
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
43
+ "upsample_rates": [8,8,2,2],
44
+ "upsample_initial_channel": 512,
45
+ "upsample_kernel_sizes": [16,16,4,4],
46
+ "n_layers_q": 3,
47
+ "use_spectral_norm": false,
48
+ "gin_channels": 256
49
+ },
50
+ "speakers": ["一方通行","上条当麻","御坂美琴","白井黒子"],
51
+ "symbols":[]
52
+ }
pretrained_models/moe-tts/10/cover.jpg ADDED

Git LFS Details

  • SHA256: cb5d83e14c8cd74a20185d8b9535f9a1699a15057f7ebce87a32f32f5aad94ba
  • Pointer size: 131 Bytes
  • Size of remote file: 104 kB
pretrained_models/moe-tts/10/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d7d3dc42ad38c3479b41c1060c442ba33018069be637e664fefafb4bb4ad764
3
+ size 220972879
pretrained_models/moe-tts/11/config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 20000,
7
+ "learning_rate": 2e-4,
8
+ "betas": [0.8, 0.99],
9
+ "eps": 1e-9,
10
+ "batch_size": 16,
11
+ "fp16_run": true,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8192,
14
+ "init_lr_ratio": 1,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0
18
+ },
19
+ "data": {
20
+ "text_cleaners":[],
21
+ "max_wav_value": 32768.0,
22
+ "sampling_rate": 22050,
23
+ "filter_length": 1024,
24
+ "hop_length": 256,
25
+ "win_length": 1024,
26
+ "n_mel_channels": 80,
27
+ "mel_fmin": 0.0,
28
+ "mel_fmax": null,
29
+ "add_blank": true,
30
+ "n_speakers": 1,
31
+ "cleaned_text": true
32
+ },
33
+ "model": {
34
+ "inter_channels": 192,
35
+ "hidden_channels": 256,
36
+ "filter_channels": 768,
37
+ "n_heads": 2,
38
+ "n_layers": 6,
39
+ "kernel_size": 3,
40
+ "p_dropout": 0.1,
41
+ "resblock": "1",
42
+ "resblock_kernel_sizes": [3,7,11],
43
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
44
+ "upsample_rates": [8,8,2,2],
45
+ "upsample_initial_channel": 512,
46
+ "upsample_kernel_sizes": [16,16,4,4],
47
+ "n_layers_q": 3,
48
+ "use_spectral_norm": false
49
+ },
50
+ "speakers": ["四季ナツメ"],
51
+ "symbols":[]
52
+ }
pretrained_models/moe-tts/11/cover.jpg ADDED

Git LFS Details

  • SHA256: 5ce5e75924dca82bb7cddbe9715f1254fe7aa0fc068085f72ff893c9324c586e
  • Pointer size: 130 Bytes
  • Size of remote file: 30.2 kB
pretrained_models/moe-tts/11/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56d55e4672c5f335ebae30728529e5efb8a9c3975a9b63e6590454ef8769ae70
3
+ size 203264375
pretrained_models/moe-tts/12/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "segment_size": 8192
4
+ },
5
+ "data": {
6
+ "text_cleaners":["japanese_cleaners2"],
7
+ "max_wav_value": 32768.0,
8
+ "sampling_rate": 22050,
9
+ "filter_length": 1024,
10
+ "hop_length": 256,
11
+ "win_length": 1024,
12
+ "add_blank": true,
13
+ "n_speakers": 12
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "kernel_size": 3,
22
+ "p_dropout": 0.1,
23
+ "resblock": "1",
24
+ "resblock_kernel_sizes": [3,7,11],
25
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "upsample_rates": [8,8,2,2],
27
+ "upsample_initial_channel": 512,
28
+ "upsample_kernel_sizes": [16,16,4,4],
29
+ "n_layers_q": 3,
30
+ "use_spectral_norm": false,
31
+ "gin_channels": 256
32
+ },
33
+ "speakers": ["\u77e2\u6765\u7f8e\u7fbd", "\u5e03\u826f\u6893", "\u30a8\u30ea\u30ca", "\u7a32\u6751\u8389\u97f3", "\u30cb\u30b3\u30e9", "\u8352\u795e\u5c0f\u591c", "\u5927\u623f\u3072\u3088\u91cc", "\u6de1\u8def\u840c\u9999", "\u30a2\u30f3\u30ca", "\u5009\u7aef\u76f4\u592a", "\u67a1\u5f62\u5175\u99ac", "\u6247\u5143\u6a39"],
34
+ "symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u2193", "\u2191", " "]
35
+ }
pretrained_models/moe-tts/12/cover.jpg ADDED

Git LFS Details

  • SHA256: a4f93df7045805bcb028b92f464710e10961bae3ce43cddf2c289212673312e2
  • Pointer size: 130 Bytes
  • Size of remote file: 41 kB
pretrained_models/moe-tts/12/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf8761f1f7818c961651d2c0d914821f742a9a1df8841aae376c888289ae5609
3
+ size 158888269
pretrained_models/moe-tts/13/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "segment_size": 8192
4
+ },
5
+ "data": {
6
+ "text_cleaners":["japanese_cleaners2"],
7
+ "max_wav_value": 32768.0,
8
+ "sampling_rate": 22050,
9
+ "filter_length": 1024,
10
+ "hop_length": 256,
11
+ "win_length": 1024,
12
+ "add_blank": true,
13
+ "n_speakers": 29
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "kernel_size": 3,
22
+ "p_dropout": 0.1,
23
+ "resblock": "1",
24
+ "resblock_kernel_sizes": [3,7,11],
25
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "upsample_rates": [8,8,2,2],
27
+ "upsample_initial_channel": 512,
28
+ "upsample_kernel_sizes": [16,16,4,4],
29
+ "n_layers_q": 3,
30
+ "use_spectral_norm": false,
31
+ "gin_channels": 256
32
+ },
33
+ "speakers": ["\u91d1\u8272\u306e\u95c7", "\u30e2\u30e2", "\u30ca\u30ca", "\u7d50\u57ce\u7f8e\u67d1", "\u53e4\u624b\u5ddd\u552f", "\u9ed2\u54b2\u82bd\u4e9c", "\u30cd\u30e1\u30b7\u30b9", "\u6751\u96e8\u9759", "\u30bb\u30ea\u30fc\u30cc", "\u30e9\u30e9", "\u5929\u6761\u9662\u6c99\u59eb", "\u897f\u9023\u5bfa\u6625\u83dc", "\u30eb\u30f3", "\u30e1\u30a4", "\u9727\u5d0e\u606d\u5b50", "\u7c7e\u5ca1\u91cc\u7d17", "\u6ca2\u7530\u672a\u592e", "\u30c6\u30a3\u30a2\u30fc\u30e6", "\u4e5d\u6761\u51db", "\u85e4\u5d0e\u7dbe", "\u7d50\u57ce\u83ef", "\u5fa1\u9580\u6dbc\u5b50", "\u30a2\u30bc\u30f3\u30c0", "\u5915\u5d0e\u68a8\u5b50", "\u7d50\u57ce\u68a8\u6597", "\u30da\u30b1", "\u733f\u5c71\u30b1\u30f3\u30a4\u30c1", "\u30ec\u30f3", "\u6821\u9577"],
34
+ "symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "#", "@", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u2193", "\u2191", " "]
35
+ }
pretrained_models/moe-tts/13/cover.jpg ADDED

Git LFS Details

  • SHA256: 4135cc056f26e03ba7e505f1be9ce76c6a9595340599f3d24cc929101f84d5f8
  • Pointer size: 130 Bytes
  • Size of remote file: 19.8 kB
pretrained_models/moe-tts/13/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e07fd627c9ad01002c889ddda9b8a9b0da9ab942115b50d44227ded7ca87ad4
3
+ size 158907213
pretrained_models/moe-tts/14/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "segment_size": 8192
4
+ },
5
+ "data": {
6
+ "text_cleaners":["cjks_cleaners"],
7
+ "max_wav_value": 32768.0,
8
+ "sampling_rate": 22050,
9
+ "filter_length": 1024,
10
+ "hop_length": 256,
11
+ "win_length": 1024,
12
+ "add_blank": true,
13
+ "n_speakers": 24
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "kernel_size": 3,
22
+ "p_dropout": 0.1,
23
+ "resblock": "1",
24
+ "resblock_kernel_sizes": [3,7,11],
25
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "upsample_rates": [8,8,2,2],
27
+ "upsample_initial_channel": 512,
28
+ "upsample_kernel_sizes": [16,16,4,4],
29
+ "n_layers_q": 3,
30
+ "use_spectral_norm": false,
31
+ "gin_channels": 256
32
+ },
33
+ "speakers": ["\u7dbe\u5730\u5be7\u3005", "\u671d\u6b66\u82b3\u4e43", "\u5728\u539f\u4e03\u6d77", "\u30eb\u30a4\u30ba", "\u91d1\u8272\u306e\u95c7", "\u30e2\u30e2", "\u7d50\u57ce\u7f8e\u67d1", "\u5c0f\u8338", "\u5510\u4e50\u541f", "\u5c0f\u6bb7", "\u82b1\u73b2", "\u516b\u56db", "\uc218\uc544", "\ubbf8\ubbf8\ub974", "\uc544\ub9b0", "\uc720\ud654", "\uc5f0\ud654", "SA1", "SA2", "SA3", "SA4", "SA5", "SA6"],
34
+ "symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "N", "Q", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "s", "t", "u", "v", "w", "x", "y", "z", "\u0283", "\u02a7", "\u02a5", "\u02a6", "\u026f", "\u0279", "\u0259", "\u0265", "\u00e7", "\u0278", "\u027e", "\u03b2", "\u014b", "\u0266", "\u02d0", "\u207c", "\u02b0", "`", "^", "#", "*", "=", "\u2192", "\u2193", "\u2191", " "]
35
+ }
pretrained_models/moe-tts/14/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2801051beb8f90bd9785604fad617bf95a8f05df93722ad8993128dd6bf91301
3
+ size 158912845
pretrained_models/moe-tts/15/config.json ADDED
The diff for this file is too large to render. See raw diff
 
pretrained_models/moe-tts/15/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f021227e3d2e282ec5756e9704dcb2a28831c3b9ae527d639a2ca9b493e0636
3
+ size 161855565
pretrained_models/moe-tts/16/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "segment_size": 8192
4
+ },
5
+ "data": {
6
+ "text_cleaners":["shanghainese_cleaners"],
7
+ "max_wav_value": 32768.0,
8
+ "sampling_rate": 22050,
9
+ "filter_length": 1024,
10
+ "hop_length": 256,
11
+ "win_length": 1024,
12
+ "add_blank": true,
13
+ "n_speakers": 2
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "kernel_size": 3,
22
+ "p_dropout": 0.1,
23
+ "resblock": "1",
24
+ "resblock_kernel_sizes": [3,7,11],
25
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "upsample_rates": [8,8,2,2],
27
+ "upsample_initial_channel": 512,
28
+ "upsample_kernel_sizes": [16,16,4,4],
29
+ "n_layers_q": 3,
30
+ "use_spectral_norm": false,
31
+ "gin_channels": 256
32
+ },
33
+ "speakers": ["\u4e0a\u6d77\u8bdd","None"],
34
+ "symbols": ["_", ",", ".", "!", "?", "\u2026", "a", "b", "d", "f", "g", "h", "i", "k", "l", "m", "n", "o", "p", "s", "t", "u", "v", "y", "z", "\u00f8", "\u014b", "\u0235", "\u0251", "\u0254", "\u0255", "\u0259", "\u0264", "\u0266", "\u026a", "\u027f", "\u0291", "\u0294", "\u02b0", "\u0303", "\u0329", "\u1d00", "\u1d07", "1", "5", "6", "7", "8", " "]
35
+ }
pretrained_models/moe-tts/16/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:750299355c3cd6bec4bca61ac50dbfb4c1e129be9b0806442cee24071bed657b
3
+ size 158882637
pretrained_models/moe-tts/17/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "segment_size": 8192
4
+ },
5
+ "data": {
6
+ "text_cleaners":["chinese_dialect_cleaners"],
7
+ "max_wav_value": 32768.0,
8
+ "sampling_rate": 22050,
9
+ "filter_length": 1024,
10
+ "hop_length": 256,
11
+ "win_length": 1024,
12
+ "add_blank": true,
13
+ "n_speakers": 50
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "kernel_size": 3,
22
+ "p_dropout": 0.1,
23
+ "resblock": "1",
24
+ "resblock_kernel_sizes": [3,7,11],
25
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "upsample_rates": [8,8,2,2],
27
+ "upsample_initial_channel": 512,
28
+ "upsample_kernel_sizes": [16,16,4,4],
29
+ "n_layers_q": 3,
30
+ "use_spectral_norm": false,
31
+ "gin_channels": 256
32
+ },
33
+ "speakers": ["\u5e03\u826f\u6893", "\u7dbe\u5730\u5be7\u3005", "\u671d\u6b66\u82b3\u4e43", "\u5728\u539f\u4e03\u6d77", "\u30e6\u30fc\u30b9\u30c6\u30a3\u30a2", "\u30b3\u30ec\u30c3\u30c8", "\u30ea\u30b7\u30a2", "\u30ab\u30a4\u30e0", "\u30eb\u30a4\u30ba", "\u3064\u304f\u3088\u307f\u3061\u3083\u3093", "\u83f2\u5442\u83c8", "\u8b1d\u5b50\u81e3", "\u96ea\u898b", "\u590f\u828a\u5e06", "\u7f85\u5c11\u5cf0", "\u8b1d\u5b50\u7487", "\u6960\u5e0c\u59d0", "\u8389\u8389", "\u5c0f\u8338", "\u5510\u4e50\u541f", "\u5c0f\u6bb7", "\u82b1\u73b2", "\u6d77\u8bcd\u4e0a\u6d77\u8bdd", "\u6d77\u8bcd\u5e7f\u4e1c\u8bdd"],
34
+ "symbols": ["_", ",", ".", "!", "?", "~", "\u2026", "\u2500", "#", "N", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u00e6", "\u00e7", "\u00f8", "\u014b", "\u0153", "\u0235", "\u0250", "\u0251", "\u0252", "\u0253", "\u0254", "\u0255", "\u0257", "\u0258", "\u0259", "\u025a", "\u025b", "\u025c", "\u0263", "\u0264", "\u0266", "\u026a", "\u026d", "\u026f", "\u0275", "\u0277", "\u0278", "\u027b", "\u027e", "\u027f", "\u0282", "\u0285", "\u028a", "\u028b", "\u028c", "\u028f", "\u0291", "\u0294", "\u02a6", "\u02ae", "\u02b0", "\u02b7", "\u02c0", "\u02d0", "\u02e5", "\u02e6", "\u02e7", "\u02e8", "\u02e9", "\u0303", "\u031a", "\u0325", "\u0329", "\u1d00", "\u1d07", "\u2191", "\u2193", "\u2205", "\u2c7c", " "]
35
+ }
pretrained_models/moe-tts/17/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bafc0ad64442808ccbdc1c880846d4d7ed30e5db6b9c68982bade0070e135a9
3
+ size 158966349
pretrained_models/moe-tts/18/config.json ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 2e-4,
8
+ "betas": [0.8, 0.99],
9
+ "eps": 1e-9,
10
+ "batch_size": 1,
11
+ "fp16_run": true,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8192,
14
+ "init_lr_ratio": 1,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0
18
+ },
19
+ "data": {
20
+ "training_files":"E:/uma_voice/output_train.txt.cleaned",
21
+ "validation_files":"E:/uma_voice/output_val.txt.cleaned",
22
+ "text_cleaners":["japanese_cleaners"],
23
+ "max_wav_value": 32768.0,
24
+ "sampling_rate": 22050,
25
+ "filter_length": 1024,
26
+ "hop_length": 256,
27
+ "win_length": 1024,
28
+ "n_mel_channels": 80,
29
+ "mel_fmin": 0.0,
30
+ "mel_fmax": null,
31
+ "add_blank": true,
32
+ "n_speakers": 87,
33
+ "cleaned_text": true
34
+ },
35
+ "model": {
36
+ "inter_channels": 192,
37
+ "hidden_channels": 192,
38
+ "filter_channels": 768,
39
+ "n_heads": 2,
40
+ "n_layers": 6,
41
+ "kernel_size": 3,
42
+ "p_dropout": 0.1,
43
+ "resblock": "1",
44
+ "resblock_kernel_sizes": [3,7,11],
45
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
46
+ "upsample_rates": [8,8,2,2],
47
+ "upsample_initial_channel": 512,
48
+ "upsample_kernel_sizes": [16,16,4,4],
49
+ "n_layers_q": 3,
50
+ "use_spectral_norm": false,
51
+ "gin_channels": 256
52
+ },
53
+ "speakers": ["Special Week",
54
+ "Silence Suzuka",
55
+ "Tokai Teio",
56
+ "Maruzensky",
57
+ "Fuji Kiseki",
58
+ "Oguri Cap",
59
+ "Gold Ship",
60
+ "Vodka",
61
+ "Daiwa Scarlet",
62
+ "Taiki Shuttle",
63
+ "Grass Wonder",
64
+ "Hishi Amazon",
65
+ "Mejiro Mcqueen",
66
+ "El Condor Pasa",
67
+ "T.M. Opera O",
68
+ "Narita Brian",
69
+ "Symboli Rudolf",
70
+ "Air Groove",
71
+ "Agnes Digital",
72
+ "Seiun Sky",
73
+ "Tamamo Cross",
74
+ "Fine Motion",
75
+ "Biwa Hayahide",
76
+ "Mayano Topgun",
77
+ "Manhattan Cafe",
78
+ "Mihono Bourbon",
79
+ "Mejiro Ryan",
80
+ "Hishi Akebono",
81
+ "Yukino Bijin",
82
+ "Rice Shower",
83
+ "Ines Fujin",
84
+ "Agnes Tachyon",
85
+ "Admire Vega",
86
+ "Inari One",
87
+ "Winning Ticket",
88
+ "Air Shakur",
89
+ "Eishin Flash",
90
+ "Curren Chan",
91
+ "Kawakami Princess",
92
+ "Gold City",
93
+ "Sakura Bakushin O",
94
+ "Seeking the Pearl",
95
+ "Shinko Windy",
96
+ "Sweep Tosho",
97
+ "Super Creek",
98
+ "Smart Falcon",
99
+ "Zenno Rob Roy",
100
+ "Tosen Jordan",
101
+ "Nakayama Festa",
102
+ "Narita Taishin",
103
+ "Nishino Flower",
104
+ "Haru Urara",
105
+ "Bamboo Memory",
106
+ "Biko Pegasus",
107
+ "Marvelous Sunday",
108
+ "Matikane Fukukitaru",
109
+ "Mr. C.B.",
110
+ "Meisho Doto",
111
+ "Mejiro Dober",
112
+ "Nice Nature",
113
+ "King Halo",
114
+ "Matikane Tannhauser",
115
+ "Ikuno Dictus",
116
+ "Mejiro Palmer",
117
+ "Daitaku Helios",
118
+ "Twin Turbo",
119
+ "Satono Diamond",
120
+ "Kitasan Black",
121
+ "Sakura Chiyono O",
122
+ "Sirius Symboli",
123
+ "Mejiro Ardan",
124
+ "Yaeno Muteki",
125
+ "Tsurumaru Tsuyoshi",
126
+ "Mejiro Bright",
127
+ "Sakura Laurel",
128
+ "Narita Top Road",
129
+ "Yamanin Zephyr",
130
+ "Symboli Kris S",
131
+ "Tanino Gimlet",
132
+ "Daiichi Ruby",
133
+ "Aston Machan",
134
+ "Hayakawa Tazuna",
135
+ "KS Miracle",
136
+ "Kopano Rickey",
137
+ "Hoko Tarumae",
138
+ "Wonder Acute",
139
+ "President Akikawa"
140
+ ],
141
+ "symbols": ["_", ",", ".", "!", "?", "-", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
142
+ }
pretrained_models/moe-tts/18/cover.jpg ADDED

Git LFS Details

  • SHA256: ea6f82b861d44d5626b7181acc8a58c9be6b6d36ddf983e89a8685c0da1f50db
  • Pointer size: 130 Bytes
  • Size of remote file: 43.5 kB
pretrained_models/moe-tts/18/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a53f4eb6bf8226b3fb4a3b31436235f697692f5566039ce3491b80af9a9567a
3
+ size 158962765
pretrained_models/moe-tts/2/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "segment_size": 8192
4
+ },
5
+ "data": {
6
+ "text_cleaners":["japanese_cleaners"],
7
+ "max_wav_value": 32768.0,
8
+ "sampling_rate": 22050,
9
+ "filter_length": 1024,
10
+ "hop_length": 256,
11
+ "win_length": 1024,
12
+ "add_blank": true,
13
+ "n_speakers": 7
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "kernel_size": 3,
22
+ "p_dropout": 0.1,
23
+ "resblock": "1",
24
+ "resblock_kernel_sizes": [3,7,11],
25
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "upsample_rates": [8,8,2,2],
27
+ "upsample_initial_channel": 512,
28
+ "upsample_kernel_sizes": [16,16,4,4],
29
+ "n_layers_q": 3,
30
+ "use_spectral_norm": false,
31
+ "gin_channels": 256
32
+ },
33
+ "speakers": ["\u56db\u5b63\u30ca\u30c4\u30e1", "\u660e\u6708\u681e\u90a3", "\u58a8\u67d3\u5e0c", "\u706b\u6253\u8c37\u611b\u8863", "\u6c50\u5c71\u6dbc\u97f3", "None", "None"],
34
+ "symbols": ["_", ",", ".", "!", "?", "-", "~","A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
35
+ }
36
+
pretrained_models/moe-tts/2/cover.jpg ADDED

Git LFS Details

  • SHA256: cf387dd1775ebf0f98245e433686a9f8f75bcc5aa8c4ceb192b8a98d0ec42432
  • Pointer size: 130 Bytes
  • Size of remote file: 60.2 kB
pretrained_models/moe-tts/2/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16adcc6dd3f23ac4407176769f1e6843f86a5b16e04b8abb5a6a11132e6b9751
3
+ size 476622149
pretrained_models/moe-tts/3/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "segment_size": 8192
4
+ },
5
+ "data": {
6
+ "text_cleaners":["japanese_cleaners"],
7
+ "max_wav_value": 32768.0,
8
+ "sampling_rate": 22050,
9
+ "filter_length": 1024,
10
+ "hop_length": 256,
11
+ "win_length": 1024,
12
+ "add_blank": true,
13
+ "n_speakers": 5
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "kernel_size": 3,
22
+ "p_dropout": 0.1,
23
+ "resblock": "1",
24
+ "resblock_kernel_sizes": [3,7,11],
25
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "upsample_rates": [8,8,2,2],
27
+ "upsample_initial_channel": 512,
28
+ "upsample_kernel_sizes": [16,16,4,4],
29
+ "n_layers_q": 3,
30
+ "use_spectral_norm": false,
31
+ "gin_channels": 256
32
+ },
33
+ "speakers": ["\u6625\u65e5\u91ce\u7a79", "\u5929\u5973\u76ee\u745b", "\u4f9d\u5a9b\u5948\u7dd2", "\u6e1a\u4e00\u8449", "None"],
34
+ "symbols": ["_", ",", ".", "!", "?", "-", "~","A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
35
+ }
36
+
pretrained_models/moe-tts/3/cover.jpg ADDED

Git LFS Details

  • SHA256: 1284933d68ad829768df808feaee25ad68693b8b004c44f675462750b94dd1d8
  • Pointer size: 130 Bytes
  • Size of remote file: 47.3 kB
pretrained_models/moe-tts/3/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60dfd6e56a1f895e3db4c054fd94d5a0362103dd5d2e19941e17dd1be41e6b11
3
+ size 476796721
pretrained_models/moe-tts/4/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "segment_size": 8192
4
+ },
5
+ "data": {
6
+ "text_cleaners":["japanese_cleaners"],
7
+ "max_wav_value": 32768.0,
8
+ "sampling_rate": 22050,
9
+ "filter_length": 1024,
10
+ "hop_length": 256,
11
+ "win_length": 1024,
12
+ "add_blank": true,
13
+ "n_speakers": 6
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "kernel_size": 3,
22
+ "p_dropout": 0.1,
23
+ "resblock": "1",
24
+ "resblock_kernel_sizes": [3,7,11],
25
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "upsample_rates": [8,8,2,2],
27
+ "upsample_initial_channel": 512,
28
+ "upsample_kernel_sizes": [16,16,4,4],
29
+ "n_layers_q": 3,
30
+ "use_spectral_norm": false,
31
+ "gin_channels": 256
32
+ },
33
+ "speakers": ["\u84ee\u83ef", "\u7bdd\u30ce\u9727\u679d", "\u6ca2\u6e21\u96eb", "\u4e9c\u7483\u5b50", "\u706f\u9732\u690e", "\u89a1\u5915\u8389"],
34
+ "symbols": ["_", ",", ".", "!", "?", "-", "~","A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
35
+ }
36
+
pretrained_models/moe-tts/4/cover.jpg ADDED

Git LFS Details

  • SHA256: 4ecc5a318f4611b93bf40a584eaf5f6849d3ce812ee7ef6316f7a4a15df2c326
  • Pointer size: 131 Bytes
  • Size of remote file: 142 kB
pretrained_models/moe-tts/4/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae80b8e7f18766625a2fe991263c7c0d42364fa1a55d772c0c645f68c72a3750
3
+ size 476799793
pretrained_models/moe-tts/5/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "segment_size": 8192
4
+ },
5
+ "data": {
6
+ "text_cleaners":["zh_ja_mixture_cleaners"],
7
+ "max_wav_value": 32768.0,
8
+ "sampling_rate": 22050,
9
+ "filter_length": 1024,
10
+ "hop_length": 256,
11
+ "win_length": 1024,
12
+ "add_blank": true,
13
+ "n_speakers": 5
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "kernel_size": 3,
22
+ "p_dropout": 0.1,
23
+ "resblock": "1",
24
+ "resblock_kernel_sizes": [3,7,11],
25
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "upsample_rates": [8,8,2,2],
27
+ "upsample_initial_channel": 512,
28
+ "upsample_kernel_sizes": [16,16,4,4],
29
+ "n_layers_q": 3,
30
+ "use_spectral_norm": false,
31
+ "gin_channels": 256
32
+ },
33
+ "speakers": ["\u7dbe\u5730\u5be7\u3005", "\u5728\u539f\u4e03\u6d77", "\u5c0f\u8338", "\u5510\u4e50\u541f"],
34
+ "symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u026f", "\u0279", "\u0259", "\u0265", "\u207c", "\u02b0", "`", "\u2192", "\u2193", "\u2191", " "]
35
+ }
pretrained_models/moe-tts/5/cover.jpg ADDED

Git LFS Details

  • SHA256: dbed43668741a90c3a7faef3c3b5aace7723b94c251106fb5925a0f1ba0d7c5c
  • Pointer size: 130 Bytes
  • Size of remote file: 30.5 kB
pretrained_models/moe-tts/5/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edfb6b428c398fab83a85b5ae41e13cb5a9f7be12692129e8a880d4553701f7b
3
+ size 158888013
pretrained_models/moe-tts/6/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "segment_size": 8192
4
+ },
5
+ "data": {
6
+ "text_cleaners":["korean_cleaners"],
7
+ "max_wav_value": 32768.0,
8
+ "sampling_rate": 22050,
9
+ "filter_length": 1024,
10
+ "hop_length": 256,
11
+ "win_length": 1024,
12
+ "add_blank": true,
13
+ "n_speakers": 6
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "kernel_size": 3,
22
+ "p_dropout": 0.1,
23
+ "resblock": "1",
24
+ "resblock_kernel_sizes": [3,7,11],
25
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "upsample_rates": [8,8,2,2],
27
+ "upsample_initial_channel": 512,
28
+ "upsample_kernel_sizes": [16,16,4,4],
29
+ "n_layers_q": 3,
30
+ "use_spectral_norm": false,
31
+ "gin_channels": 256
32
+ },
33
+ "speakers": ["\uc218\uc544", "\ubbf8\ubbf8\ub974", "\uc544\ub9b0", "\uc5f0\ud654", "\uc720\ud654", "\uc120\ubc30"],
34
+ "symbols": ["_", ",", ".", "!", "?", "\u2026", "~", "\u3131", "\u3134", "\u3137", "\u3139", "\u3141", "\u3142", "\u3145", "\u3147", "\u3148", "\u314a", "\u314b", "\u314c", "\u314d", "\u314e", "\u3132", "\u3138", "\u3143", "\u3146", "\u3149", "\u314f", "\u3153", "\u3157", "\u315c", "\u3161", "\u3163", "\u3150", "\u3154", " "]
35
+ }
pretrained_models/moe-tts/6/cover.jpg ADDED

Git LFS Details

  • SHA256: 38e71373daa8849f04bd7867845676afab2057e69a5e0a1e312c2b6cfdd72794
  • Pointer size: 131 Bytes
  • Size of remote file: 146 kB
pretrained_models/moe-tts/6/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5461551d900d726e24fe5551c3773c0c27419c9237882fe7d400025344499f85
3
+ size 158875981
pretrained_models/moe-tts/7/config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 10,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 2e-4,
8
+ "betas": [0.8, 0.99],
9
+ "eps": 1e-9,
10
+ "batch_size": 50,
11
+ "fp16_run": true,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8192,
14
+ "init_lr_ratio": 1,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0
18
+ },
19
+ "data": {
20
+ "training_files":"filelists/multi_speaker.csv",
21
+ "validation_files":"filelists/multi_speaker_val.csv",
22
+ "text_cleaners":["japanese_cleaners"],
23
+ "max_wav_value": 32768.0,
24
+ "sampling_rate": 22050,
25
+ "filter_length": 1024,
26
+ "hop_length": 256,
27
+ "win_length": 1024,
28
+ "n_mel_channels": 80,
29
+ "mel_fmin": 0.0,
30
+ "mel_fmax": null,
31
+ "add_blank": true,
32
+ "n_speakers": 13,
33
+ "cleaned_text": true
34
+ },
35
+ "model": {
36
+ "inter_channels": 192,
37
+ "hidden_channels": 192,
38
+ "filter_channels": 768,
39
+ "n_heads": 2,
40
+ "n_layers": 6,
41
+ "kernel_size": 3,
42
+ "p_dropout": 0.1,
43
+ "resblock": "1",
44
+ "resblock_kernel_sizes": [3,7,11],
45
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
46
+ "upsample_rates": [8,8,2,2],
47
+ "upsample_initial_channel": 512,
48
+ "upsample_kernel_sizes": [16,16,4,4],
49
+ "n_layers_q": 3,
50
+ "use_spectral_norm": false,
51
+ "gin_channels": 256
52
+ },
53
+ "speakers": ["鷹倉杏璃","鷹倉杏鈴","アペイリア", "倉科明日香","ATRI", "アイラ", "新堂彩音", "姫野星奏", "小鞠ゆい", "聖代橋氷織", "有坂真白", "白咲美絵瑠", "二階堂真紅"],
54
+ "symbols":["_", ",", ".", "!", "?", "-", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
55
+ }
pretrained_models/moe-tts/7/cover.jpg ADDED

Git LFS Details

  • SHA256: cd98e72f9a5de9df03d2cffae41f907dd70116b4ae89d9fe218df6fa45cd1767
  • Pointer size: 130 Bytes
  • Size of remote file: 98.8 kB
pretrained_models/moe-tts/7/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f96e046a777407883d4665777118bdfbe0a48fc18c5fdea16c1d05eaa3af7773
3
+ size 476818993
pretrained_models/moe-tts/8/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "segment_size": 8192
4
+ },
5
+ "data": {
6
+ "text_cleaners":["japanese_cleaners2"],
7
+ "max_wav_value": 32768.0,
8
+ "sampling_rate": 22050,
9
+ "filter_length": 1024,
10
+ "hop_length": 256,
11
+ "win_length": 1024,
12
+ "add_blank": true,
13
+ "n_speakers": 26
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "kernel_size": 3,
22
+ "p_dropout": 0.1,
23
+ "resblock": "1",
24
+ "resblock_kernel_sizes": [3,7,11],
25
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "upsample_rates": [8,8,2,2],
27
+ "upsample_initial_channel": 512,
28
+ "upsample_kernel_sizes": [16,16,4,4],
29
+ "n_layers_q": 3,
30
+ "use_spectral_norm": false,
31
+ "gin_channels": 256
32
+ },
33
+ "speakers": ["\u30eb\u30a4\u30ba", "\u30c6\u30a3\u30d5\u30a1\u30cb\u30a2", "\u30a4\u30eb\u30af\u30af\u30a5", "\u30a2\u30f3\u30ea\u30a8\u30c3\u30bf", "\u30bf\u30d0\u30b5", "\u30b7\u30a8\u30b9\u30bf", "\u30cf\u30eb\u30ca", "\u5c11\u5973\u30ea\u30b7\u30e5", "\u30ea\u30b7\u30e5", "\u30a2\u30ad\u30ca", "\u30af\u30ea\u30b9", "\u30ab\u30c8\u30ec\u30a2", "\u30a8\u30ec\u30aa\u30ce\u30fc\u30eb", "\u30e2\u30f3\u30e2\u30e9\u30f3\u30b7\u30fc", "\u30ea\u30fc\u30f4\u30eb", "\u30ad\u30e5\u30eb\u30b1", "\u30a6\u30a7\u30b6\u30ea\u30fc", "\u30b5\u30a4\u30c8", "\u30ae\u30fc\u30b7\u30e5", "\u30b3\u30eb\u30d9\u30fc\u30eb", "\u30aa\u30b9\u30de\u30f3", "\u30c7\u30eb\u30d5\u30ea\u30f3\u30ac\u30fc", "\u30c6\u30af\u30b9\u30c8", "\u30c0\u30f3\u30d7\u30ea\u30e1", "\u30ac\u30ec\u30c3\u30c8", "\u30b9\u30ab\u30ed\u30f3"],
34
+ "symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u2193", "\u2191", " "]
35
+ }