Mahiruoshi commited on
Commit
dc23363
•
1 Parent(s): f6fda8c

Upload 112 files

Browse files
app.py CHANGED
@@ -216,11 +216,9 @@ WrapStyle: 0
216
  PlayResX: 640
217
  PlayResY: 360
218
  ScaledBorderAndShadow: yes
219
-
220
  [V4+ Styles]
221
  Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
222
  Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1
223
-
224
  [Events]
225
  Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
226
  """
@@ -338,7 +336,7 @@ def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale
338
  if __name__ == "__main__":
339
  parser = argparse.ArgumentParser()
340
  parser.add_argument(
341
- "-m", "--model", default="./logs/BangDream/G_43000.pth", help="path of your model"
342
  )
343
  parser.add_argument(
344
  "-c",
@@ -387,7 +385,7 @@ if __name__ == "__main__":
387
  ]
388
  with gr.Blocks() as app:
389
  gr.Markdown(
390
- f"Bang Dreamๅ…จๅ‘˜TTS,ไฝฟ็”จๆœฌๆจกๅž‹่ฏทไธฅๆ ผ้ตๅฎˆๆณ•ๅพ‹ๆณ•่ง„!\n ๅ‘ๅธƒไบŒๅˆ›ไฝœๅ“่ฏทๆ ‡ๆณจๆœฌ้กน็›ฎไฝœ่€…<a href='https://space.bilibili.com/19874615/'>B็ซ™@Mahiroshi</a>ๅŠ้กน็›ฎ้“พๆŽฅ\nไปŽ <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>ๆˆ‘็š„ๅšๅฎข็ซ™็‚น</a> ๆŸฅ็œ‹ไฝฟ็”จ่ฏดๆ˜Ž</a>"
391
  )
392
  for band in BandList:
393
  with gr.TabItem(band):
@@ -444,9 +442,9 @@ if __name__ == "__main__":
444
  with gr.Row():
445
  with gr.Column():
446
  gr.Markdown(
447
- f"ไปŽ <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>ๆˆ‘็š„ๅšๅฎข็ซ™็‚น</a> ๆŸฅ็œ‹ไฝฟ็”จ่ฏดๆ˜Ž\nๆธธๆˆ่„šๆœฌ่ง<a href='https://bestdori.com/tool/explorer/asset/cn/scenario'>bestdori</a>"
448
  )
449
- inputFile = gr.inputs.File(label="ไธŠไผ ๆธธๆˆ่„šๆœฌ(ๆ—ฅๆ–‡)ใ€ไธญๆ–‡่„šๆœฌ(้œ€่ฎพ็ฝฎ่ง’่‰ฒๅฏนๅบ”ๅ…ณ็ณป)ใ€่‡ชๅˆถๆ–‡ใ€(้œ€่ฎพ็ฝฎ่ง’่‰ฒๅฏนๅบ”ๅ…ณ็ณป")
450
  groupSize = gr.Slider(
451
  minimum=10, maximum=1000,value = i[1], step=1, label="ๅฝ“ไธช้Ÿณ้ข‘ๆ–‡ไปถๅŒ…ๅซ็š„ๆœ€ๅคงๅญ—ๆ•ฐ"
452
  )
 
216
  PlayResX: 640
217
  PlayResY: 360
218
  ScaledBorderAndShadow: yes
 
219
  [V4+ Styles]
220
  Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
221
  Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1
 
222
  [Events]
223
  Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
224
  """
 
336
  if __name__ == "__main__":
337
  parser = argparse.ArgumentParser()
338
  parser.add_argument(
339
+ "-m", "--model", default="./logs/BangDream/G_7000.pth", help="path of your model"
340
  )
341
  parser.add_argument(
342
  "-c",
 
385
  ]
386
  with gr.Blocks() as app:
387
  gr.Markdown(
388
+ f"ๅฐ‘ๆญŒ้‚ฆ้‚ฆๅ…จๅ‘˜TTS,ไฝฟ็”จๆœฌๆจกๅž‹่ฏทไธฅๆ ผ้ตๅฎˆๆณ•ๅพ‹ๆณ•่ง„!\n ๅ‘ๅธƒไบŒๅˆ›ไฝœๅ“่ฏทๆ ‡ๆณจๆœฌ้กน็›ฎไฝœ่€…<a href='https://space.bilibili.com/19874615/'>B็ซ™@Mahiroshi</a>ๅŠ้กน็›ฎ้“พๆŽฅ\nไปŽ <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>ๆˆ‘็š„ๅšๅฎข็ซ™็‚น</a> ๆŸฅ็œ‹ไฝฟ็”จ่ฏดๆ˜Ž</a>"
389
  )
390
  for band in BandList:
391
  with gr.TabItem(band):
 
442
  with gr.Row():
443
  with gr.Column():
444
  gr.Markdown(
445
+ f"ไปŽ <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>ๆˆ‘็š„ๅšๅฎข็ซ™็‚น</a> ๆŸฅ็œ‹่‡ชๅˆถgalgameไฝฟ็”จ่ฏดๆ˜Ž\n</a>"
446
  )
447
+ inputFile = gr.inputs.File(label="ไธŠไผ txt(ๅฏ่ฎพ็ฝฎ่ง’่‰ฒๅฏนๅบ”่กจ)ใ€epubๆˆ–mobiๆ–‡ไปถ")
448
  groupSize = gr.Slider(
449
  minimum=10, maximum=1000,value = i[1], step=1, label="ๅฝ“ไธช้Ÿณ้ข‘ๆ–‡ไปถๅŒ…ๅซ็š„ๆœ€ๅคงๅญ—ๆ•ฐ"
450
  )
bert/bert-base-japanese-v3/README.md CHANGED
@@ -50,4 +50,4 @@ The pretrained models are distributed under the Apache License 2.0.
50
 
51
  ## Acknowledgments
52
 
53
- This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
 
50
 
51
  ## Acknowledgments
52
 
53
+ This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
bert/bert-base-japanese-v3/vocab.txt CHANGED
@@ -13,7 +13,7 @@
13
  [unused7]
14
  [unused8]
15
  [unused9]
16
-
17
  !
18
  "
19
  #
 
13
  [unused7]
14
  [unused8]
15
  [unused9]
16
+
17
  !
18
  "
19
  #
bert/chinese-roberta-wwm-ext-large/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.bin
bert/chinese-roberta-wwm-ext-large/README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- language:
3
  - zh
4
  tags:
5
  - bert
@@ -9,9 +9,9 @@ license: "apache-2.0"
9
  # Please use 'Bert' related functions to load this model!
10
 
11
  ## Chinese BERT with Whole Word Masking
12
- For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**.
13
 
14
- **[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**
15
  Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu
16
 
17
  This repository is developed based on๏ผšhttps://github.com/google-research/bert
@@ -46,7 +46,7 @@ If you find the technical report or resource is useful, please cite the followin
46
  pages = "657--668",
47
  }
48
  ```
49
- - Secondary: https://arxiv.org/abs/1906.08101
50
  ```
51
  @article{chinese-bert-wwm,
52
  title={Pre-Training with Whole Word Masking for Chinese BERT},
@@ -54,4 +54,4 @@ If you find the technical report or resource is useful, please cite the followin
54
  journal={arXiv preprint arXiv:1906.08101},
55
  year={2019}
56
  }
57
- ```
 
1
  ---
2
+ language:
3
  - zh
4
  tags:
5
  - bert
 
9
  # Please use 'Bert' related functions to load this model!
10
 
11
  ## Chinese BERT with Whole Word Masking
12
+ For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**.
13
 
14
+ **[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**
15
  Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu
16
 
17
  This repository is developed based on๏ผšhttps://github.com/google-research/bert
 
46
  pages = "657--668",
47
  }
48
  ```
49
+ - Secondary: https://arxiv.org/abs/1906.08101
50
  ```
51
  @article{chinese-bert-wwm,
52
  title={Pre-Training with Whole Word Masking for Chinese BERT},
 
54
  journal={arXiv preprint arXiv:1906.08101},
55
  year={2019}
56
  }
57
+ ```
bert/chinese-roberta-wwm-ext-large/added_tokens.json CHANGED
@@ -1 +1 @@
1
- {}
 
1
+ {}
bert/chinese-roberta-wwm-ext-large/special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
bert/chinese-roberta-wwm-ext-large/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
bert/chinese-roberta-wwm-ext-large/tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"init_inputs": []}
 
1
+ {"init_inputs": []}
bert_gen.py CHANGED
@@ -21,13 +21,12 @@ def process_line(line):
21
  word2ph = [i for i in word2ph]
22
  phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
23
 
24
- if hps.data.add_blank:
25
- phone = commons.intersperse(phone, 0)
26
- tone = commons.intersperse(tone, 0)
27
- language = commons.intersperse(language, 0)
28
- for i in range(len(word2ph)):
29
- word2ph[i] = word2ph[i] * 2
30
- word2ph[0] += 1
31
 
32
  bert_path = wav_path.replace(".wav", ".bert.pt")
33
 
 
21
  word2ph = [i for i in word2ph]
22
  phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
23
 
24
+ phone = commons.intersperse(phone, 0)
25
+ tone = commons.intersperse(tone, 0)
26
+ language = commons.intersperse(language, 0)
27
+ for i in range(len(word2ph)):
28
+ word2ph[i] = word2ph[i] * 2
29
+ word2ph[0] += 1
 
30
 
31
  bert_path = wav_path.replace(".wav", ".bert.pt")
32
 
configs/config.json CHANGED
@@ -10,7 +10,7 @@
10
  0.99
11
  ],
12
  "eps": 1e-09,
13
- "batch_size": 24,
14
  "fp16_run": false,
15
  "lr_decay": 0.999875,
16
  "segment_size": 16384,
@@ -35,31 +35,254 @@
35
  "n_speakers": 256,
36
  "cleaned_text": true,
37
  "spk2id": {
38
- "็‡ˆ": 0,
39
- "ใใ‚ˆ": 1,
40
- "็ฅฅๅญ": 2,
41
- "็ซ‹ๅธŒ": 3,
42
- "็ฆ": 4,
43
- "ๆ„›้Ÿณ": 5,
44
- "็ฅž็ง˜ไบบ": 6,
45
- "้ฆ™ๆพ„": 7,
46
- "ๆฒ™็ถพ": 8,
47
- "ๆฅฝๅฅˆ": 9,
48
- "ไธ€ๅŒ": 10,
49
- "ๆตท้ˆด": 11,
50
- "ใซใ‚ƒใ‚€": 12,
51
- "ใƒขใ‚ซ": 13,
52
- "่˜ญ": 14,
53
- "ใ‚Šใฟ": 15,
54
- "ๆœ‰ๅ’ฒ": 16,
55
- "ๅ‡›ใ€…ๅญ": 17,
56
- "ๅˆ่ฏ": 18,
57
- "ใฒใพใ‚Š": 19,
58
- "ใคใใฟ": 20,
59
- "ๅทด": 21,
60
- "ใƒญใƒƒใ‚ฏ": 22,
61
- "ใ‚ใ“": 23,
62
- "ใ‚ชใƒผใƒŠใƒผ": 24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  }
64
  },
65
  "model": {
@@ -116,4 +339,4 @@
116
  "use_spectral_norm": false,
117
  "gin_channels": 256
118
  }
119
- }
 
10
  0.99
11
  ],
12
  "eps": 1e-09,
13
+ "batch_size": 8,
14
  "fp16_run": false,
15
  "lr_decay": 0.999875,
16
  "segment_size": 16384,
 
35
  "n_speakers": 256,
36
  "cleaned_text": true,
37
  "spk2id": {
38
+ "ไธนๆ’": 0,
39
+ "ๅ…‹ๆ‹‰ๆ‹‰": 1,
40
+ "็ฉน": 2,
41
+ "ใ€Œไฟกไฝฟใ€": 3,
42
+ "ๅฒ็“ฆ็ฝ—": 4,
43
+ "ๅฝฆๅฟ": 5,
44
+ "ๆ™ด้œ“": 6,
45
+ "ๆฐๅธ•ๅพท": 7,
46
+ "็ด ่ฃณ": 8,
47
+ "็ปฟ่Š™่“‰": 9,
48
+ "็ฝ—ๅˆน": 10,
49
+ "่‰พไธๅฆฒ": 11,
50
+ "้ป‘ๅก”": 12,
51
+ "ไธนๆžข": 13,
52
+ "ๅธŒ้œฒ็“ฆ": 14,
53
+ "็™ฝ้œฒ": 15,
54
+ "่ดนๆ–ฏๆ›ผ": 16,
55
+ "ๅœไบ‘": 17,
56
+ "ๅฏๅฏๅˆฉไบš": 18,
57
+ "ๆ™ฏๅ…ƒ": 19,
58
+ "่žบไธๅ’•ๅง†": 20,
59
+ "้’้•ž": 21,
60
+ "ๅ…ฌ่พ“ๅธˆๅ‚…": 22,
61
+ "ๅก่Š™ๅก": 23,
62
+ "ๅคงๆฏซ": 24,
63
+ "้ฉญ็ฉบ": 25,
64
+ "ๅŠๅค": 26,
65
+ "ๅฅฅๅˆ—ๆ ผ": 27,
66
+ "ๅจœๅก”่ŽŽ": 28,
67
+ "ๆก‘ๅš": 29,
68
+ "็“ฆๅฐ”็‰น": 30,
69
+ "้˜ฟๅ…ฐ": 31,
70
+ "ไผฆ็บณๅพท": 32,
71
+ "ไฝฉๆ‹‰": 33,
72
+ "ๅกๆณข็‰น": 34,
73
+ "ๅธ•ๅง†": 35,
74
+ "ๅธ•ๆ–ฏๅก": 36,
75
+ "้’้›€": 37,
76
+ "ไธ‰ๆœˆไธƒ": 38,
77
+ "ๅˆƒ": 39,
78
+ "ๅงฌๅญ": 40,
79
+ "ๅธƒๆด›ๅฆฎๅจ…": 41,
80
+ "ๅธŒๅ„ฟ": 42,
81
+ "ๆ˜Ÿ": 43,
82
+ "็ฌฆ็Ž„": 44,
83
+ "่™Žๅ…‹": 45,
84
+ "้“ถ็‹ผ": 46,
85
+ "้•œๆต": 47,
86
+ "ใ€Œๅšๅฃซใ€": 48,
87
+ "ใ€Œๅคง่‚‰ไธธใ€": 49,
88
+ "ไนๆก่ฃŸ็ฝ—": 50,
89
+ "ไฝ่ฅฟๆ‘ฉๆ–ฏ": 51,
90
+ "ๅˆปๆ™ด": 52,
91
+ "ๅšๆ˜“": 53,
92
+ "ๅก็ปด": 54,
93
+ "ๅฏ่Ž‰": 55,
94
+ "ๅ˜‰็Ž›": 56,
95
+ "ๅŸƒ่ˆๅฐ”": 57,
96
+ "ๅก”ๆฐยทๆ‹‰ๅพทๅกๅฐผ": 58,
97
+ "ๅคงๆ…ˆๆ ‘็Ž‹": 59,
98
+ "ๅฎตๅฎซ": 60,
99
+ "ๅบท็บณ": 61,
100
+ "ๅฝฑ": 62,
101
+ "ๆžซๅŽŸไธ‡ๅถ": 63,
102
+ "ๆฌง่ฒๅฆฎ": 64,
103
+ "็Ž›ไน”ไธฝ": 65,
104
+ "็Š็‘š": 66,
105
+ "็”ฐ้“ๅ˜ด": 67,
106
+ "็ ‚็ณ–": 68,
107
+ "็ฅž้‡Œ็ปซๅŽ": 69,
108
+ "็ฝ—่ŽŽ่Ž‰ไบš": 70,
109
+ "่’ๆณทไธ€ๆ–—": 71,
110
+ "่ŽŽๆ‹‰": 72,
111
+ "่ฟชๅธŒ้›…": 73,
112
+ "้’Ÿ็ฆป": 74,
113
+ "้˜ฟๅœ†": 75,
114
+ "้˜ฟๅจœ่€ถ": 76,
115
+ "้˜ฟๆ‹‰ๅคซ": 77,
116
+ "้›ทๆณฝ": 78,
117
+ "้ฆ™่ฑ": 79,
118
+ "้พ™ไบŒ": 80,
119
+ "ใ€Œๅ…ฌๅญใ€": 81,
120
+ "ใ€Œ็™ฝ่€ๅ…ˆ็”Ÿใ€": 82,
121
+ "ไผ˜่ˆ": 83,
122
+ "ๅ‡ฏ็‘Ÿ็ณ": 84,
123
+ "ๅ“ฒๅนณ": 85,
124
+ "ๅคๆด›่’‚": 86,
125
+ "ๅฎ‰ๆŸ": 87,
126
+ "ๅทด่พพ็ปด": 88,
127
+ "ๅผๅคงๅฐ†": 89,
128
+ "ๆ–ฏๅฆๅˆฉ": 90,
129
+ "ๆฏ—ไผฝๅฐ”": 91,
130
+ "ๆตทๅฆฎ่€ถ": 92,
131
+ "็ˆฑๅพท็ณ": 93,
132
+ "็บณ่ฅฟๅฆฒ": 94,
133
+ "่€ๅญŸ": 95,
134
+ "่Š™ๅฎๅจœ": 96,
135
+ "้˜ฟๅฎˆ": 97,
136
+ "้˜ฟ็ฅ‡": 98,
137
+ "ไธนๅ‰ๅฐ”": 99,
138
+ "ไธฝ่ŽŽ": 100,
139
+ "ไบ”้ƒŽ": 101,
140
+ "ๅ…ƒๅคช": 102,
141
+ "ๅ…‹ๅˆ—้—จ็‰น": 103,
142
+ "ๅ…‹็ฝ—็ดข": 104,
143
+ "ๅŒ—ๆ–—": 105,
144
+ "ๅŸƒๅ‹’ๆ›ผ": 106,
145
+ "ๅคฉ็›ฎๅไบ”": 107,
146
+ "ๅฅฅๅ…น": 108,
147
+ "ๆถ้พ™": 109,
148
+ "ๆ—ฉๆŸš": 110,
149
+ "ๆœๆ‹‰ๅคซ": 111,
150
+ "ๆพๆตฆ": 112,
151
+ "ๆŸŠๅƒ้‡Œ": 113,
152
+ "็”˜้›จ": 114,
153
+ "็Ÿณๅคด": 115,
154
+ "็บฏๆฐด็ฒพ็ต๏ผŸ": 116,
155
+ "็พฝ็”Ÿ็”ฐๅƒ้นค": 117,
156
+ "่Žฑไพๆ‹‰": 118,
157
+ "่ฒ่ฐขๅฐ”": 119,
158
+ "่จ€็ฌ‘": 120,
159
+ "่ฏบ่‰พๅฐ”": 121,
160
+ "่ต›่ฏบ": 122,
161
+ "่พ›็„ฑ": 123,
162
+ "่ฟชๅจœๆณฝ้ป›": 124,
163
+ "้‚ฃ็ปด่Žฑ็‰น": 125,
164
+ "ๅ…ซ้‡็ฅžๅญ": 126,
165
+ "ๅ‡ฏไบš": 127,
166
+ "ๅด่ˆน้•ฟ": 128,
167
+ "ๅŸƒๅพท": 129,
168
+ "ๅคฉๅ”": 130,
169
+ "ๅฅณๅฃซ": 131,
170
+ "ๆ•็ญ ": 132,
171
+ "ๆ็บณ้‡Œ": 133,
172
+ "ๆดพ่’™": 134,
173
+ "ๆตๆตช่€…": 135,
174
+ "ๆทฑๆธŠไฝฟๅพ’": 136,
175
+ "็Ž›ๆ ผไธฝ็‰น": 137,
176
+ "็้œฒ็Š": 138,
177
+ "็ด": 139,
178
+ "็‘ถ็‘ถ": 140,
179
+ "็•™ไบ‘ๅ€Ÿ้ฃŽ็œŸๅ›": 141,
180
+ "็ปฎ่‰ฏ่‰ฏ": 142,
181
+ "่ˆ’ไผฏ็‰น": 143,
182
+ "่ง": 144,
183
+ "่Žซๅจœ": 145,
184
+ "่กŒ็ง‹": 146,
185
+ "่ฟˆๅ‹’ๆ–ฏ": 147,
186
+ "้˜ฟไฝฉๆ™ฎ": 148,
187
+ "้นฟ้‡Žๅฅˆๅฅˆ": 149,
188
+ "ไธƒไธƒ": 150,
189
+ "ไผŠ่ฟชๅจ…": 151,
190
+ "ๅšๆฅ": 152,
191
+ "ๅŽ่’‚ไธ": 153,
192
+ "ๅŸƒๅฐ”ๆฌฃๆ น": 154,
193
+ "ๅŸƒๆณฝ": 155,
194
+ "ๅกž็‰ๆ–ฏ": 156,
195
+ "ๅคœๅ…ฐ": 157,
196
+ "ๅธธไน็ˆท": 158,
197
+ "ๆ‚ฆ": 159,
198
+ "ๆˆดๅ› ๆ–ฏ้›ทๅธƒ": 160,
199
+ "็ฌผ้’“็“ถไธ€ๅฟƒ": 161,
200
+ "็บณๆฏ”ๅฐ”": 162,
201
+ "่ƒกๆกƒ": 163,
202
+ "่‰พๅฐ”ๆตทๆฃฎ": 164,
203
+ "่‰พ่Ž‰ไธ": 165,
204
+ "่ฒ็ฑณๅฐผ": 166,
205
+ "่’‚็Ž›ไนŒๆ–ฏ": 167,
206
+ "่ฟชๅฅฅๅจœ": 168,
207
+ "้˜ฟๆ™ƒ": 169,
208
+ "้˜ฟๆด›็“ฆ": 170,
209
+ "้™†่กŒๅฒฉๆœฌ็œŸ่•ˆยทๅ…ƒ็ด ็”Ÿๅ‘ฝ": 171,
210
+ "้›ท็”ตๅฐ†ๅ†›": 172,
211
+ "้ญˆ": 173,
212
+ "้นฟ้‡Ž้™ขๅนณ่—": 174,
213
+ "ใ€Œๅฅณๅฃซใ€": 175,
214
+ "ใ€Œๆ•ฃๅ…ตใ€": 176,
215
+ "ๅ‡ๅ…‰": 177,
216
+ "ๅฆฎ้œฒ": 178,
217
+ "ๅจœ็ปดๅจ…": 179,
218
+ "ๅฎ›็ƒŸ": 180,
219
+ "ๆ…งๅฟƒ": 181,
220
+ "ๆ‰˜ๅ…‹": 182,
221
+ "ๆ‰˜้ฉฌ": 183,
222
+ "ๆŽ‡ๆ˜Ÿๆ”ซ่พฐๅคฉๅ›": 184,
223
+ "ๆ—็™ฝ": 185,
224
+ "ๆตฎๆธธๆฐด่•ˆๅ…ฝยทๅ…ƒ็ด ็”Ÿๅ‘ฝ": 186,
225
+ "็ƒŸ็ปฏ": 187,
226
+ "็Ž›ๅกžๅ‹’": 188,
227
+ "็™พ้—ป": 189,
228
+ "็Ÿฅๆ˜“": 190,
229
+ "็ฑณๅก": 191,
230
+ "่ฅฟๆ‹‰ๆฐ": 192,
231
+ "่ฟชๅขๅ…‹": 193,
232
+ "้‡ไบ‘": 194,
233
+ "้˜ฟๆ‰Žๅฐ”": 195,
234
+ "้œๅคซๆ›ผ": 196,
235
+ "ไธŠๆ‰": 197,
236
+ "ไน…ๅˆฉ้กป": 198,
237
+ "ๅ˜‰่‰ฏ": 199,
238
+ "ๅ›žๅฃฐๆตท่žบ": 200,
239
+ "ๅคš่Ž‰": 201,
240
+ "ๅฎ‰่ฅฟ": 202,
241
+ "ๅพทๆฒƒๆฒ™ๅ…‹": 203,
242
+ "ๆ‹‰่ตซๆ›ผ": 204,
243
+ "ๆž—ๅฐผ": 205,
244
+ "ๆŸฅๅฐ”ๆ–ฏ": 206,
245
+ "ๆทฑๆธŠๆณ•ๅธˆ": 207,
246
+ "ๆธฉ่ฟช": 208,
247
+ "็ˆฑ่ดๅฐ”": 209,
248
+ "็Š็‘šๅฎซๅฟƒๆตท": 210,
249
+ "็ญๅฐผ็‰น": 211,
250
+ "็ณๅฆฎ็‰น": 212,
251
+ "็”ณ้นค": 213,
252
+ "็ฅž้‡Œ็ปซไบบ": 214,
253
+ "่‰พไผฏ็‰น": 215,
254
+ "่ๅงฅๅงฅ": 216,
255
+ "่จ่ตซๅ“ˆ่’‚": 217,
256
+ "่จ้ฝๅ› ": 218,
257
+ "้˜ฟๅฐ”ๅก็ฑณ": 219,
258
+ "้˜ฟ่ดๅคš": 220,
259
+ "anzai": 221,
260
+ "ไน…ๅฒๅฟ": 222,
261
+ "ไนๆก้•ฐๆฒป": 223,
262
+ "ไบ‘ๅ ‡": 224,
263
+ "ไผŠๅˆฉไบšๆ–ฏ": 225,
264
+ "ๅŸƒๆด›ไผŠ": 226,
265
+ "ๅกžๅก”่•พ": 227,
266
+ "ๆ‹‰้ฝ": 228,
267
+ "ๆ˜†้’ง": 229,
268
+ "ๆŸฏ่Žฑ": 230,
269
+ "ๆฒ™ๆ‰Žๆ›ผ": 231,
270
+ "ๆตท่Šญๅค": 232,
271
+ "็™ฝๆœฏ": 233,
272
+ "็ฉบ": 234,
273
+ "่‰พๆ–‡": 235,
274
+ "่Šญ่Šญๆ‹‰": 236,
275
+ "่ŽซๅกžไผŠๆ€": 237,
276
+ "่Žบๅ„ฟ": 238,
277
+ "่พพ่พพๅˆฉไบš": 239,
278
+ "่ฟˆ่’™": 240,
279
+ "้•ฟ็”Ÿ": 241,
280
+ "้˜ฟๅทดๅ›พไผŠ": 242,
281
+ "้™†ๆ™ฏๅ’Œ": 243,
282
+ "่Žซๅผˆ": 244,
283
+ "ๅคๅฝฆ": 245,
284
+ "ๅทฆ็„ถ": 246,
285
+ "ๆ ‡่ด": 247
286
  }
287
  },
288
  "model": {
 
339
  "use_spectral_norm": false,
340
  "gin_channels": 256
341
  }
342
+ }
data_utils.py CHANGED
@@ -155,7 +155,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
155
  if language_str == "ZH":
156
  bert = bert
157
  ja_bert = torch.zeros(768, len(phone))
158
- elif language_str == "JA":
159
  ja_bert = bert
160
  bert = torch.zeros(1024, len(phone))
161
  else:
 
155
  if language_str == "ZH":
156
  bert = bert
157
  ja_bert = torch.zeros(768, len(phone))
158
+ elif language_str == "JP":
159
  ja_bert = bert
160
  bert = torch.zeros(1024, len(phone))
161
  else:
filelists/esd.list ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Example:
2
+ {wav_path}|{speaker_name}|{language}|{text}
3
+ ๆดพ่’™_1.wav|ๆดพ่’™|ZH|ๅ‰้ข็š„ๅŒบๅŸŸ๏ผŒไปฅๅŽๅ†ๆฅๆŽข็ดขๅง๏ผ
image/41JjBPWdHtL._SX342_SY445_.jpg ADDED
image/41JjBPWdHtL.jpg ADDED
logs/Bangdream/G_7000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92e3ea6239c8f2b16efff571ba07232dd5de71067d2fc87e3f2e0ef490e2d7eb
3
+ size 857912686
logs/Bangdream/config.json ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 52,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0003,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 16,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 16384,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "skip_optimizer": true
22
+ },
23
+ "data": {
24
+ "training_files": "filelists/train.list",
25
+ "validation_files": "filelists/val.list",
26
+ "max_wav_value": 32768.0,
27
+ "sampling_rate": 44100,
28
+ "filter_length": 2048,
29
+ "hop_length": 512,
30
+ "win_length": 2048,
31
+ "n_mel_channels": 128,
32
+ "mel_fmin": 0.0,
33
+ "mel_fmax": null,
34
+ "add_blank": true,
35
+ "n_speakers": 256,
36
+ "cleaned_text": true,
37
+ "spk2id": {
38
+ "ไธ‰ๆœˆไธƒ": 0,
39
+ "้ฆ™ๆพ„": 1,
40
+ "ๆœ‰ๅ’ฒ": 2,
41
+ "ๆฒ™็ถพ": 3,
42
+ "ใ‚Šใฟ": 4,
43
+ "ใŸใˆ": 5,
44
+ "ๆฒ™็ถพใ€ใ‚Šใฟใ€ใŸใˆ": 6,
45
+ "ๅทด": 7,
46
+ "ไธ€ๅŒ": 8,
47
+ "ใพใ‚Šใช": 9,
48
+ "ใ‚†ใ‚Š": 10,
49
+ "ๆ˜Žๆ—ฅ้ฆ™": 11,
50
+ "๏ผŸ๏ผŸ๏ผŸ": 12,
51
+ "ใฒใพใ‚Š": 13,
52
+ "ใƒขใ‚ซ": 14,
53
+ "ใคใใฟ": 15,
54
+ "่˜ญ": 16,
55
+ "ใƒชใ‚ต": 17,
56
+ "ๅƒ่–": 18,
57
+ "่Šฑ้Ÿณ": 19,
58
+ "ใ‚คใƒด": 20,
59
+ "ๆ—ฅ่œ": 21,
60
+ "ๅ‹ๅธŒ้‚ฃ": 22,
61
+ "็ด—ๅคœ": 23,
62
+ "ใ“ใ“ใ‚": 24,
63
+ "็พŽๅ’ฒ": 25,
64
+ "่–ซ": 26,
65
+ "ใฏใใฟ": 27,
66
+ "ใƒŸใƒƒใ‚ทใ‚งใƒซ": 28,
67
+ "ใƒžใƒชใƒผ": 29,
68
+ "ๆ€ช็›—ใƒใƒญใƒใƒƒใƒ”ใƒผ": 30,
69
+ "ใƒ‹ใ‚ณใƒชใƒผใƒŠ": 31,
70
+ "ๅฝฉ": 32,
71
+ "้บปๅผฅ": 33,
72
+ "็‡ๅญ": 34,
73
+ "ใ‚ใ“": 35,
74
+ "ใ‚†ใใช": 36,
75
+ "ใพใ—ใ‚": 37,
76
+ "ใคใใ—": 38,
77
+ "้€ๅญ": 39,
78
+ "ไธƒๆทฑ": 40,
79
+ "็‘ ๅ”ฏ": 41,
80
+ "ๅ…ญ่Šฑ": 42,
81
+ "ใƒ‘ใƒฌใ‚ช": 43,
82
+ "ใƒฌใ‚คใƒค": 44,
83
+ "ใƒžใ‚นใ‚ญใƒณใ‚ฐ": 45,
84
+ "ใƒใƒฅใƒใƒฅ": 46,
85
+ "ใพใ™ใ": 47,
86
+ "ใƒญใƒƒใ‚ฏ": 48,
87
+ "ไปค็Ž‹้‚ฃ": 49,
88
+ "CHIYU": 50,
89
+ "ใƒฌใ‚ค": 51,
90
+ "็‡ˆ": 52,
91
+ "ใใ‚ˆ": 53,
92
+ "็ฅฅๅญ": 54,
93
+ "็ซ‹ๅธŒ": 55,
94
+ "็ฆ": 56,
95
+ "ๆ„›้Ÿณ": 57,
96
+ "ๆฅฝๅฅˆ": 58,
97
+ "ๆตท้ˆด": 59
98
+ }
99
+ },
100
+ "model": {
101
+ "use_spk_conditioned_encoder": true,
102
+ "use_noise_scaled_mas": true,
103
+ "use_mel_posterior_encoder": false,
104
+ "use_duration_discriminator": true,
105
+ "inter_channels": 192,
106
+ "hidden_channels": 192,
107
+ "filter_channels": 768,
108
+ "n_heads": 2,
109
+ "n_layers": 6,
110
+ "kernel_size": 3,
111
+ "p_dropout": 0.1,
112
+ "resblock": "1",
113
+ "resblock_kernel_sizes": [
114
+ 3,
115
+ 7,
116
+ 11
117
+ ],
118
+ "resblock_dilation_sizes": [
119
+ [
120
+ 1,
121
+ 3,
122
+ 5
123
+ ],
124
+ [
125
+ 1,
126
+ 3,
127
+ 5
128
+ ],
129
+ [
130
+ 1,
131
+ 3,
132
+ 5
133
+ ]
134
+ ],
135
+ "upsample_rates": [
136
+ 8,
137
+ 8,
138
+ 2,
139
+ 2,
140
+ 2
141
+ ],
142
+ "upsample_initial_channel": 512,
143
+ "upsample_kernel_sizes": [
144
+ 16,
145
+ 16,
146
+ 8,
147
+ 2,
148
+ 2
149
+ ],
150
+ "n_layers_q": 3,
151
+ "use_spectral_norm": false,
152
+ "gin_channels": 256
153
+ }
154
+ }
models.py CHANGED
@@ -763,7 +763,7 @@ class SynthesizerTrn(nn.Module):
763
  gin_channels=256,
764
  use_sdp=True,
765
  n_flow_layer=4,
766
- n_layers_trans_flow=4,
767
  flow_share_parameter=False,
768
  use_transformer_flow=True,
769
  **kwargs
 
763
  gin_channels=256,
764
  use_sdp=True,
765
  n_flow_layer=4,
766
+ n_layers_trans_flow=6,
767
  flow_share_parameter=False,
768
  use_transformer_flow=True,
769
  **kwargs
monotonic_align/__pycache__/__init__.cpython-39.pyc CHANGED
Binary files a/monotonic_align/__pycache__/__init__.cpython-39.pyc and b/monotonic_align/__pycache__/__init__.cpython-39.pyc differ
 
monotonic_align/__pycache__/core.cpython-39.pyc CHANGED
Binary files a/monotonic_align/__pycache__/core.cpython-39.pyc and b/monotonic_align/__pycache__/core.cpython-39.pyc differ
 
preprocess_text.py CHANGED
@@ -1,4 +1,5 @@
1
  import json
 
2
  from collections import defaultdict
3
  from random import shuffle
4
  from typing import Optional
@@ -11,7 +12,7 @@ from text.cleaner import clean_text
11
  @click.command()
12
  @click.option(
13
  "--transcription-path",
14
- default="filelists/Mygo.list",
15
  type=click.Path(exists=True, file_okay=True, dir_okay=False),
16
  )
17
  @click.option("--cleaned-path", default=None)
@@ -67,13 +68,27 @@ def main(
67
  current_sid = 0
68
 
69
  with open(transcription_path, encoding="utf-8") as f:
 
 
 
70
  for line in f.readlines():
71
  utt, spk, language, text, phones, tones, word2ph = line.strip().split("|")
 
 
 
 
 
 
 
 
 
 
72
  spk_utt_map[spk].append(line)
73
 
74
  if spk not in spk_id_map.keys():
75
  spk_id_map[spk] = current_sid
76
  current_sid += 1
 
77
 
78
  train_list = []
79
  val_list = []
 
1
  import json
2
+ import os.path
3
  from collections import defaultdict
4
  from random import shuffle
5
  from typing import Optional
 
12
  @click.command()
13
  @click.option(
14
  "--transcription-path",
15
+ default="filelists/genshin.list",
16
  type=click.Path(exists=True, file_okay=True, dir_okay=False),
17
  )
18
  @click.option("--cleaned-path", default=None)
 
68
  current_sid = 0
69
 
70
  with open(transcription_path, encoding="utf-8") as f:
71
+ audioPaths = set()
72
+ countSame = 0
73
+ countNotFound = 0
74
  for line in f.readlines():
75
  utt, spk, language, text, phones, tones, word2ph = line.strip().split("|")
76
+ if utt in audioPaths:
77
+ # ่ฟ‡ๆปคๆ•ฐๆฎ้›†้”™่ฏฏ๏ผš็›ธๅŒ็š„้Ÿณ้ข‘ๅŒน้…ๅคšไธชๆ–‡ๆœฌ๏ผŒๅฏผ่‡ดๅŽ็ปญbertๅ‡บ้—ฎ้ข˜
78
+ print(f"้‡ๅค้Ÿณ้ข‘ๆ–‡ๆœฌ๏ผš{line}")
79
+ countSame += 1
80
+ continue
81
+ if not os.path.isfile(utt):
82
+ print(f"ๆฒกๆœ‰ๆ‰พๅˆฐๅฏนๅบ”็š„้Ÿณ้ข‘๏ผš{utt}")
83
+ countNotFound += 1
84
+ continue
85
+ audioPaths.add(utt)
86
  spk_utt_map[spk].append(line)
87
 
88
  if spk not in spk_id_map.keys():
89
  spk_id_map[spk] = current_sid
90
  current_sid += 1
91
+ print(f"ๆ€ป้‡ๅค้Ÿณ้ข‘ๆ•ฐ๏ผš{countSame}๏ผŒๆ€ปๆœชๆ‰พๅˆฐ็š„้Ÿณ้ข‘ๆ•ฐ:{countNotFound}")
92
 
93
  train_list = []
94
  val_list = []
requirements.txt CHANGED
@@ -21,6 +21,3 @@ unidic-lite
21
  cmudict
22
  fugashi
23
  num2words
24
- PyPDF2
25
- ebooklib
26
- beautifulsoup4
 
21
  cmudict
22
  fugashi
23
  num2words
 
 
 
text/__init__.py CHANGED
@@ -1,6 +1,5 @@
1
  from text.symbols import *
2
 
3
-
4
  _symbol_to_id = {s: i for i, s in enumerate(symbols)}
5
 
6
 
 
1
  from text.symbols import *
2
 
 
3
  _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4
 
5
 
text/__pycache__/__init__.cpython-39.pyc CHANGED
Binary files a/text/__pycache__/__init__.cpython-39.pyc and b/text/__pycache__/__init__.cpython-39.pyc differ
 
text/__pycache__/chinese.cpython-39.pyc CHANGED
Binary files a/text/__pycache__/chinese.cpython-39.pyc and b/text/__pycache__/chinese.cpython-39.pyc differ
 
text/__pycache__/chinese_bert.cpython-39.pyc CHANGED
Binary files a/text/__pycache__/chinese_bert.cpython-39.pyc and b/text/__pycache__/chinese_bert.cpython-39.pyc differ
 
text/__pycache__/cleaner.cpython-39.pyc CHANGED
Binary files a/text/__pycache__/cleaner.cpython-39.pyc and b/text/__pycache__/cleaner.cpython-39.pyc differ
 
text/__pycache__/english_bert_mock.cpython-39.pyc CHANGED
Binary files a/text/__pycache__/english_bert_mock.cpython-39.pyc and b/text/__pycache__/english_bert_mock.cpython-39.pyc differ
 
text/__pycache__/japanese.cpython-39.pyc CHANGED
Binary files a/text/__pycache__/japanese.cpython-39.pyc and b/text/__pycache__/japanese.cpython-39.pyc differ
 
text/__pycache__/japanese_bert.cpython-39.pyc CHANGED
Binary files a/text/__pycache__/japanese_bert.cpython-39.pyc and b/text/__pycache__/japanese_bert.cpython-39.pyc differ
 
text/__pycache__/symbols.cpython-39.pyc CHANGED
Binary files a/text/__pycache__/symbols.cpython-39.pyc and b/text/__pycache__/symbols.cpython-39.pyc differ
 
text/__pycache__/tone_sandhi.cpython-39.pyc CHANGED
Binary files a/text/__pycache__/tone_sandhi.cpython-39.pyc and b/text/__pycache__/tone_sandhi.cpython-39.pyc differ
 
train_ms.py CHANGED
@@ -42,12 +42,6 @@ torch.backends.cuda.enable_mem_efficient_sdp(
42
  torch.backends.cuda.enable_math_sdp(True)
43
  global_step = 0
44
 
45
- import os
46
-
47
- os.environ['MASTER_ADDR'] = '127.0.0.1'
48
- os.environ['MASTER_PORT'] = '8880'
49
- os.environ['WORLD_SIZE'] = '1'
50
- os.environ['RANK'] = '0'
51
 
52
  def run():
53
  dist.init_process_group(
@@ -197,6 +191,8 @@ def run():
197
  optim_g.param_groups[0]["initial_lr"] = g_resume_lr
198
  if not optim_d.param_groups[0].get("initial_lr"):
199
  optim_d.param_groups[0]["initial_lr"] = d_resume_lr
 
 
200
 
201
  epoch_str = max(epoch_str, 1)
202
  global_step = (epoch_str - 1) * len(train_loader)
 
42
  torch.backends.cuda.enable_math_sdp(True)
43
  global_step = 0
44
 
 
 
 
 
 
 
45
 
46
  def run():
47
  dist.init_process_group(
 
191
  optim_g.param_groups[0]["initial_lr"] = g_resume_lr
192
  if not optim_d.param_groups[0].get("initial_lr"):
193
  optim_d.param_groups[0]["initial_lr"] = d_resume_lr
194
+ if not optim_dur_disc.param_groups[0].get("initial_lr"):
195
+ optim_dur_disc.param_groups[0]["initial_lr"] = dur_resume_lr
196
 
197
  epoch_str = max(epoch_str, 1)
198
  global_step = (epoch_str - 1) * len(train_loader)
utils.py CHANGED
@@ -206,15 +206,14 @@ def get_hparams(init=True):
206
  config_path = args.config
207
  config_save_path = os.path.join(model_dir, "config.json")
208
  if init:
209
- with open(config_path, "r") as f:
210
  data = f.read()
211
- with open(config_save_path, "w") as f:
212
  f.write(data)
213
  else:
214
- with open(config_save_path, "r") as f:
215
  data = f.read()
216
  config = json.loads(data)
217
-
218
  hparams = HParams(**config)
219
  hparams.model_dir = model_dir
220
  return hparams
 
206
  config_path = args.config
207
  config_save_path = os.path.join(model_dir, "config.json")
208
  if init:
209
+ with open(config_path, "r", encoding="utf-8") as f:
210
  data = f.read()
211
+ with open(config_save_path, "w", encoding="utf-8") as f:
212
  f.write(data)
213
  else:
214
+ with open(config_save_path, "r", encoding="utf-8") as f:
215
  data = f.read()
216
  config = json.loads(data)
 
217
  hparams = HParams(**config)
218
  hparams.model_dir = model_dir
219
  return hparams
webui.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa: E402
2
+
3
+ import sys, os
4
+ import logging
5
+
6
# Cap the log level of chatty third-party libraries at WARNING so their
# lower-severity messages do not drown out this application's own output.
for _noisy_name in ("numba", "markdown_it", "urllib3", "matplotlib"):
    logging.getLogger(_noisy_name).setLevel(logging.WARNING)
del _noisy_name

# Root logger configuration: INFO level, "| name | LEVEL | message" layout.
logging.basicConfig(
    format="| %(name)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)

# Module-level logger used by the rest of this file.
logger = logging.getLogger(__name__)
16
+
17
+ import torch
18
+ import argparse
19
+ import commons
20
+ import utils
21
+ from models import SynthesizerTrn
22
+ from text.symbols import symbols
23
+ from text import cleaned_text_to_sequence, get_bert
24
+ from text.cleaner import clean_text
25
+ import gradio as gr
26
+ import webbrowser
27
+ import numpy as np
28
+
29
# Global synthesizer model. It stays ``None`` until the ``__main__`` section
# builds the network and loads the checkpoint; ``infer`` reads it from here.
net_g = None

# Default inference device for code that imports this module.
# Previously every non-macOS host was assumed to have CUDA, which yields an
# unusable "cuda" device string on CPU-only machines; fall back to the CPU
# when no accelerator is present.
if sys.platform == "darwin" and torch.backends.mps.is_available():
    device = "mps"
    # Ask PyTorch to fall back to the CPU for ops the MPS backend lacks.
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
36
+
37
+
38
def get_text(text, language_str, hps):
    """Convert raw text into the tensors consumed by the synthesizer.

    The text is cleaned and converted to phone/tone/language id sequences,
    optionally interleaved with blank (0) tokens, and paired with BERT
    features obtained from ``get_bert``.

    Args:
        text: Input text to synthesize.
        language_str: Language tag; ``"ZH"`` and ``"JP"`` select which BERT
            slot receives the real features, any other value zeroes both.
        hps: Hyper-parameter object; only ``hps.data.add_blank`` is read.

    Returns:
        Tuple ``(bert, ja_bert, phone, tone, language)`` where the last three
        are ``torch.LongTensor`` sequences. The unused BERT slot is a zero
        tensor with 1024 rows (``bert``) or 768 rows (``ja_bert``) and
        ``len(phone)`` columns.

    Raises:
        AssertionError: If the BERT feature length does not match the number
            of phones.
    """
    norm_text, phone, tone, word2ph = clean_text(text, language_str)
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

    if hps.data.add_blank:
        # Insert a blank token (id 0) between every pair of symbols.
        phone, tone, language = (
            commons.intersperse(seq, 0) for seq in (phone, tone, language)
        )
        # Each word now spans twice as many positions; the first word also
        # gets one extra position (presumably the leading blank -- confirm
        # against commons.intersperse).
        for idx, count in enumerate(word2ph):
            word2ph[idx] = count * 2
        word2ph[0] += 1

    bert = get_bert(norm_text, word2ph, language_str, device)
    del word2ph
    seq_len = len(phone)
    assert bert.shape[-1] == seq_len, phone

    if language_str == "ZH":
        # Real features stay in ``bert``; the Japanese slot is zero-filled.
        ja_bert = torch.zeros(768, seq_len)
    elif language_str == "JP":
        # Real features move to the Japanese slot; ``bert`` is zero-filled.
        ja_bert = bert
        bert = torch.zeros(1024, seq_len)
    else:
        bert = torch.zeros(1024, seq_len)
        ja_bert = torch.zeros(768, seq_len)

    assert (
        bert.shape[-1] == seq_len
    ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"

    return (
        bert,
        ja_bert,
        torch.LongTensor(phone),
        torch.LongTensor(tone),
        torch.LongTensor(language),
    )
71
+
72
+
73
def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid, language):
    """Synthesize audio for one piece of text with the global model.

    Relies on the module globals ``net_g`` (the synthesizer), ``hps``
    (hyper-parameters, including the ``spk2id`` mapping) and ``device``.

    Args:
        text: Text to synthesize.
        sdp_ratio: Forwarded to ``net_g.infer`` as ``sdp_ratio``.
        noise_scale: Forwarded to ``net_g.infer`` as ``noise_scale``.
        noise_scale_w: Forwarded to ``net_g.infer`` as ``noise_scale_w``.
        length_scale: Forwarded to ``net_g.infer`` as ``length_scale``.
        sid: Speaker name, looked up in ``hps.data.spk2id``.
        language: Language tag passed to ``get_text``.

    Returns:
        A NumPy float array taken from element ``[0][0, 0]`` of the model
        output (presumably the mono waveform -- confirm against the model).
    """
    bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps)
    with torch.no_grad():
        # Move every input to the target device and add a batch dimension.
        x_tst = phones.to(device).unsqueeze(0)
        tones = tones.to(device).unsqueeze(0)
        lang_ids = lang_ids.to(device).unsqueeze(0)
        bert = bert.to(device).unsqueeze(0)
        ja_bert = ja_bert.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
        del phones

        speaker_id = hps.data.spk2id[sid]
        speakers = torch.LongTensor([speaker_id]).to(device)

        sampling_kwargs = dict(
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )
        audio = (
            net_g.infer(
                x_tst, x_tst_lengths, speakers, tones, lang_ids, bert, ja_bert,
                **sampling_kwargs,
            )[0][0, 0]
            .data.cpu()
            .float()
            .numpy()
        )

        # Drop the device tensors and release cached GPU memory before
        # handing the result back.
        del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
        torch.cuda.empty_cache()
        return audio
106
+
107
+
108
def tts_fn(
    text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, language
):
    """Gradio callback: synthesize ``text`` segment by segment.

    The text is split on ``"|"``; each segment is synthesized with ``infer``
    and followed by ``hps.data.sampling_rate`` zero samples (one second of
    silence). All pieces are concatenated into a single waveform.

    Args:
        text: Input text, with ``"|"`` separating segments.
        speaker: Speaker name forwarded to ``infer`` as ``sid``.
        sdp_ratio: Forwarded unchanged to ``infer``.
        noise_scale: Forwarded unchanged to ``infer``.
        noise_scale_w: Forwarded unchanged to ``infer``.
        length_scale: Forwarded unchanged to ``infer``.
        language: Language tag forwarded to ``infer``.

    Returns:
        Tuple ``("Success", (sampling_rate, waveform))``.
    """
    pieces = []
    with torch.no_grad():
        for segment in text.split("|"):
            pieces.append(
                infer(
                    segment,
                    sdp_ratio=sdp_ratio,
                    noise_scale=noise_scale,
                    noise_scale_w=noise_scale_w,
                    length_scale=length_scale,
                    sid=speaker,
                    language=language,
                )
            )
            # Generate one second of silence and append it after the segment.
            pieces.append(np.zeros(hps.data.sampling_rate))
    return "Success", (hps.data.sampling_rate, np.concatenate(pieces))
129
+
130
+
131
if __name__ == "__main__":
    # ---- Command-line options ----
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model", default="./logs/as/G_8000.pth", help="path of your model"
    )
    parser.add_argument(
        "-c",
        "--config",
        default="./configs/config.json",
        help="path of your config file",
    )
    parser.add_argument(
        "--share", default=False, help="make link public", action="store_true"
    )
    parser.add_argument(
        "-d", "--debug", action="store_true", help="enable DEBUG-LEVEL log"
    )

    args = parser.parse_args()
    if args.debug:
        logger.info("Enable DEBUG-LEVEL log")
        # The root logger was already configured by the module-level
        # logging.basicConfig() call, and a second basicConfig() is a no-op
        # once handlers exist. Set the level on the root logger directly so
        # the --debug flag actually takes effect.
        logging.getLogger().setLevel(logging.DEBUG)
    hps = utils.get_hparams_from_file(args.config)

    # ---- Device selection: CUDA first, then Apple MPS, then CPU ----
    device = (
        "cuda:0"
        if torch.cuda.is_available()
        else (
            "mps"
            if sys.platform == "darwin" and torch.backends.mps.is_available()
            else "cpu"
        )
    )

    # ---- Build the synthesizer and load the checkpoint for inference ----
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model,
    ).to(device)
    _ = net_g.eval()

    _ = utils.load_checkpoint(args.model, net_g, None, skip_optimizer=True)

    # ---- Gradio user interface ----
    speaker_ids = hps.data.spk2id
    speakers = list(speaker_ids.keys())
    languages = ["ZH", "JP"]
    with gr.Blocks() as app:
        with gr.Row():
            # Left column: text input and synthesis controls.
            with gr.Column():
                text = gr.TextArea(
                    label="Text",
                    placeholder="Input Text Here",
                    value="ๅƒ่‘ก่„ไธๅ่‘ก่„็šฎ๏ผŒไธๅƒ่‘ก่„ๅ€’ๅ่‘ก่„็šฎใ€‚",
                )
                speaker = gr.Dropdown(
                    choices=speakers, value=speakers[0], label="Speaker"
                )
                sdp_ratio = gr.Slider(
                    minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
                )
                noise_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise Scale"
                )
                noise_scale_w = gr.Slider(
                    minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise Scale W"
                )
                length_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=1, step=0.1, label="Length Scale"
                )
                language = gr.Dropdown(
                    choices=languages, value=languages[0], label="Language"
                )
                btn = gr.Button("Generate!", variant="primary")
            # Right column: status message and generated audio.
            with gr.Column():
                text_output = gr.Textbox(label="Message")
                audio_output = gr.Audio(label="Output Audio")

        # The input order must match the positional parameters of tts_fn.
        btn.click(
            tts_fn,
            inputs=[
                text,
                speaker,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                language,
            ],
            outputs=[text_output, audio_output],
        )

    webbrowser.open("http://127.0.0.1:7860")
    app.launch(share=args.share)