baibaibai ChrisPreston committed on
Commit
e882f51
0 Parent(s):

Duplicate from ChrisPreston/diff-svc_minato_aqua


Co-authored-by: ChrisPreston <ChrisPreston@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +35 -0
  2. README.md +14 -0
  3. app.py +86 -0
  4. aqua/clean_model_ckpt_steps_100000.ckpt +3 -0
  5. aqua/config.yaml +457 -0
  6. checkpoints/0102_xiaoma_pe/config.yaml +172 -0
  7. checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt +3 -0
  8. checkpoints/hubert/hubert.onnx +3 -0
  9. checkpoints/hubert/hubert_soft.pt +3 -0
  10. checkpoints/nsf_hifigan/NOTICE.txt +74 -0
  11. checkpoints/nsf_hifigan/config.json +38 -0
  12. checkpoints/nsf_hifigan/model +3 -0
  13. infer.py +81 -0
  14. infer_tools/__pycache__/f0_static.cpython-38.pyc +0 -0
  15. infer_tools/__pycache__/infer_tool.cpython-38.pyc +0 -0
  16. infer_tools/__pycache__/infer_tool_beta.cpython-38.pyc +0 -0
  17. infer_tools/__pycache__/slicer.cpython-38.pyc +0 -0
  18. infer_tools/__pycache__/trans_key.cpython-38.pyc +0 -0
  19. infer_tools/f0_static.py +116 -0
  20. infer_tools/f0_temp.json +0 -0
  21. infer_tools/infer_tool.py +201 -0
  22. infer_tools/slicer.py +142 -0
  23. infer_tools/trans_key.py +67 -0
  24. modules/__pycache__/encoder.cpython-310.pyc +0 -0
  25. modules/__pycache__/encoder.cpython-38.pyc +0 -0
  26. modules/commons/__pycache__/common_layers.cpython-310.pyc +0 -0
  27. modules/commons/__pycache__/common_layers.cpython-38.pyc +0 -0
  28. modules/commons/__pycache__/ssim.cpython-310.pyc +0 -0
  29. modules/commons/__pycache__/ssim.cpython-38.pyc +0 -0
  30. modules/commons/common_layers.py +675 -0
  31. modules/commons/ssim.py +84 -0
  32. modules/diff/__pycache__/diffusion.cpython-310.pyc +0 -0
  33. modules/diff/__pycache__/diffusion.cpython-38.pyc +0 -0
  34. modules/diff/__pycache__/net.cpython-310.pyc +0 -0
  35. modules/diff/__pycache__/net.cpython-38.pyc +0 -0
  36. modules/diff/diffusion.py +312 -0
  37. modules/diff/net.py +135 -0
  38. modules/encoder.py +208 -0
  39. modules/hubert/__pycache__/cn_hubert.cpython-38.pyc +0 -0
  40. modules/hubert/__pycache__/hubert_model.cpython-38.pyc +0 -0
  41. modules/hubert/__pycache__/hubert_onnx.cpython-38.pyc +0 -0
  42. modules/hubert/cn_hubert.py +40 -0
  43. modules/hubert/hubert_model.py +243 -0
  44. modules/hubert/hubert_onnx.py +19 -0
  45. modules/nsf_hifigan/__pycache__/env.cpython-310.pyc +0 -0
  46. modules/nsf_hifigan/__pycache__/env.cpython-38.pyc +0 -0
  47. modules/nsf_hifigan/__pycache__/models.cpython-310.pyc +0 -0
  48. modules/nsf_hifigan/__pycache__/models.cpython-38.pyc +0 -0
  49. modules/nsf_hifigan/__pycache__/nvSTFT.cpython-310.pyc +0 -0
  50. modules/nsf_hifigan/__pycache__/nvSTFT.cpython-38.pyc +0 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ checkpoints/nsf_hifigan/model filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
1
+ ---
2
+ title: Diff-svc Minato Aqua
3
+ emoji: 🐨
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 3.17.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ duplicated_from: ChrisPreston/diff-svc_minato_aqua
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,86 @@
1
+ from utils.hparams import hparams
2
+ import scipy.io.wavfile as wav
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import IPython.display as ipd
6
+ import utils
7
+ import librosa
8
+ import torch
9
+ import torchcrepe
10
+ from infer import *
11
+ import logging
12
+ from infer_tools.infer_tool import *
13
+ import gradio as gr
14
+ import json
15
+
16
+ logging.getLogger('numba').setLevel(logging.WARNING)
17
+ svc_model = None
18
+ project_name = "aqua"
19
+ wave_name = f"./temp.wav"
20
+ model_path = f'./aqua/clean_model_ckpt_steps_100000.ckpt'
21
+ config_path = f'./aqua/config.yaml'
22
+ spk_id = "aqua"
23
+
24
+ def infer(wav_fn, tran, accelerate, auto_key):
25
+ model = Svc(project_name, config_path, hubert_gpu=False, model_path=model_path, onnx=False)
26
+
27
+ if wav_fn is not None:
28
+ audio_path = wav_fn
29
+ else:
30
+ return "请先上传wav格式的音频文件", None, None
31
+ run_clip(raw_audio_path=audio_path, svc_model=model, key=tran, acc=accelerate, use_crepe=True,
32
+ spk_id=spk_id, auto_key=auto_key, project_name=project_name, out_path=wave_name)
33
+
34
+ au_out = wave_name
35
+
36
+ return "转换成功", au_out
37
+
38
+ app = gr.Blocks()
39
+ with app:
40
+ with gr.Tabs():
41
+ with gr.TabItem("推理"):
42
+ with gr.Blocks():
43
+ with gr.Blocks():
44
+ with gr.Box():
45
+ gr.Markdown(value="""**上传音频**""")
46
+ with gr.Row():
47
+ upload_input = gr.Audio(source="upload", label="源音频", type="filepath", elem_id="audio_inputs")
48
+ out_audio = gr.Audio(label="输出音频")
49
+ with gr.Blocks():
50
+ with gr.Box():
51
+ gr.Markdown(value="""**参数设置**""")
52
+ with gr.Row():
53
+ auto = gr.Checkbox(label="启用自动变调", value=False)
54
+ with gr.Row():
55
+ acc_vaule = gr.Slider(1, 50, value=20, interactive=True, label="加速倍率")
56
+ with gr.Row():
57
+ pitch_vaule = gr.Slider(-96, 96, value=0, interactive=True, label="变调(半音)")
58
+ with gr.Row():
59
+ with gr.Column(scale=1):
60
+ infer_md = gr.Button("转换音频", variant="primary")
61
+ with gr.Blocks():
62
+ with gr.Box():
63
+ gr.Markdown(value="""**输出日志**""")
64
+ infer_msg = gr.Textbox(label="日志")
65
+ infer_md.click(infer, [upload_input, pitch_vaule, acc_vaule, auto], [infer_msg, out_audio])
66
+ with gr.TabItem("说明"):
67
+ gr.Markdown(value="""
68
+ Modified CPU inference version; no limit on audio length and no denoising, so please make sure the input audio is of good quality\n
69
+ If you need local CPU inference, you can download all of the files\n
70
+ Original project: https://github.com/openvpi/diff-svc\n
71
+ Code modifications: @ChrisPreston\n
72
+ Model training: @ChrisPreston\n
73
+ Voice source: Aqua Ch. 湊あくあ https://www.youtube.com/@MinatoAqua カバー株式会社\n
74
+ Model usage terms (important):\n
75
+ 1. Do not use for commercial purposes\n
76
+ 2. Do not use in ways that could affect the streamer herself (for example impersonating her to make controversial statements)\n
77
+ 3. Do not use for gore, violence, sexual or political content\n
78
+ 4. Redistribution of the model is not allowed\n
79
+ 5. For any non-personal use, credit the model author @ChrisPreston and the original diff-svc project\n
80
+ 6. Use for game voice-overs and livestreaming in personal entertainment settings is allowed, but not for low-effort content; please contact me before using it for livestreams\n
81
+ Contact: email kameiliduo0825@gmail.com, bilibili: https://space.bilibili.com/18801308\n
82
+ Disclaimer: I accept no responsibility for any legal disputes arising from the use of this model
83
+ """)
84
+
85
+ app.launch(share=False)
86
+
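Note: the Space above drives the model through Svc and run_clip from infer_tools/infer_tool.py and infer.py (both included in this commit). For the local CPU inference mentioned in the notes tab, a minimal sketch along the lines of app.py could look like the following; it assumes the repository files (including the aqua checkpoint and config) have been downloaded locally, and the input path ./raw/input.wav is a hypothetical example.

# Minimal local CPU inference sketch (assumes this repository's files are present locally).
# Mirrors what app.py does when the "Convert audio" button is clicked.
from infer import run_clip
from infer_tools import infer_tool
from infer_tools.infer_tool import Svc

project_name = "aqua"
model_path = "./aqua/clean_model_ckpt_steps_100000.ckpt"
config_path = "./aqua/config.yaml"

infer_tool.mkdir(["./raw", "./results"])

# hubert_gpu=False keeps everything on the CPU, as in the Space.
model = Svc(project_name, config_path, hubert_gpu=False, model_path=model_path, onnx=False)

# key: pitch shift in semitones; acc: diffusion speed-up factor; auto_key: automatic key shift.
run_clip(raw_audio_path="./raw/input.wav",   # hypothetical input file
         svc_model=model, key=0, acc=20, use_crepe=True,
         spk_id="aqua", auto_key=False, project_name=project_name,
         out_path="./results/output.wav")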
aqua/clean_model_ckpt_steps_100000.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14d1e9bf1dde30fcb397ebf91e61e77fc34cf22f6d1d6fd112eba57113a75795
3
+ size 227124201
aqua/config.yaml ADDED
@@ -0,0 +1,457 @@
1
+ K_step: 1000
2
+ accumulate_grad_batches: 1
3
+ audio_num_mel_bins: 128
4
+ audio_sample_rate: 44100
5
+ binarization_args:
6
+ shuffle: false
7
+ with_align: true
8
+ with_f0: true
9
+ with_hubert: true
10
+ with_spk_embed: false
11
+ with_wav: false
12
+ binarizer_cls: preprocessing.SVCpre.SVCBinarizer
13
+ binary_data_dir: data/binary/aquapre
14
+ check_val_every_n_epoch: 10
15
+ choose_test_manually: false
16
+ clip_grad_norm: 1
17
+ config_path: F:\diff-svc-main\training\config_nsf.yaml
18
+ content_cond_steps: []
19
+ cwt_add_f0_loss: false
20
+ cwt_hidden_size: 128
21
+ cwt_layers: 2
22
+ cwt_loss: l1
23
+ cwt_std_scale: 0.8
24
+ datasets:
25
+ - opencpop
26
+ debug: false
27
+ dec_ffn_kernel_size: 9
28
+ dec_layers: 4
29
+ decay_steps: 20000
30
+ decoder_type: fft
31
+ dict_dir: ''
32
+ diff_decoder_type: wavenet
33
+ diff_loss_type: l2
34
+ dilation_cycle_length: 4
35
+ dropout: 0.1
36
+ ds_workers: 4
37
+ dur_enc_hidden_stride_kernel:
38
+ - 0,2,3
39
+ - 0,2,3
40
+ - 0,1,3
41
+ dur_loss: mse
42
+ dur_predictor_kernel: 3
43
+ dur_predictor_layers: 5
44
+ enc_ffn_kernel_size: 9
45
+ enc_layers: 4
46
+ encoder_K: 8
47
+ encoder_type: fft
48
+ endless_ds: false
49
+ f0_bin: 256
50
+ f0_max: 1100.0
51
+ f0_min: 40.0
52
+ f0_static: '{"28.0": 0.07, "29.0": 0.03, "31.0": 0.05, "32.0": 0.08, "33.0": 0.12,
53
+ "34.0": 0.02, "35.0": 0.06, "36.0": 0.02, "37.0": 0.01, "38.0": 0.1, "39.0": 0.05,
54
+ "40.0": 0.09, "41.0": 0.14, "42.0": 0.16, "43.0": 0.03, "44.0": 0.42, "45.0": 0.74,
55
+ "46.0": 1.13, "47.0": 1.49, "48.0": 1.76, "49.0": 2.59, "50.0": 3.03, "51.0": 2.71,
56
+ "52.0": 1.93, "53.0": 1.11, "54.0": 0.78, "55.0": 3.33, "56.0": 20.38, "57.0": 69.6,
57
+ "58.0": 167.04, "59.0": 245.1, "60.0": 318.87, "61.0": 373.41, "62.0": 434.86, "63.0":
58
+ 415.63, "64.0": 448.97, "65.0": 452.99, "66.0": 474.88, "67.0": 471.54, "68.0":
59
+ 455.78, "69.0": 421.71, "70.0": 372.06, "71.0": 323.85, "72.0": 292.8, "73.0": 238.94,
60
+ "74.0": 190.5, "75.0": 132.86, "76.0": 88.03, "77.0": 53.16, "78.0": 32.96, "79.0":
61
+ 23.66, "80.0": 14.74, "81.0": 8.54, "82.0": 5.0, "83.0": 3.32, "84.0": 2.29, "85.0":
62
+ 0.91, "total_time": 6576.43}'
63
+ ffn_act: gelu
64
+ ffn_padding: SAME
65
+ fft_size: 2048
66
+ fmax: 16000
67
+ fmin: 40
68
+ fs2_ckpt: ''
69
+ gaussian_start: true
70
+ gen_dir_name: ''
71
+ gen_tgt_spk_id: -1
72
+ hidden_size: 256
73
+ hop_size: 512
74
+ hubert_gpu: true
75
+ hubert_path: checkpoints/hubert/hubert_soft.pt
76
+ infer: false
77
+ keep_bins: 128
78
+ lambda_commit: 0.25
79
+ lambda_energy: 0.0
80
+ lambda_f0: 1.0
81
+ lambda_ph_dur: 0.3
82
+ lambda_sent_dur: 1.0
83
+ lambda_uv: 1.0
84
+ lambda_word_dur: 1.0
85
+ load_ckpt: ''
86
+ log_interval: 100
87
+ loud_norm: false
88
+ lr: 0.0008
89
+ max_beta: 0.02
90
+ max_epochs: 3000
91
+ max_eval_sentences: 1
92
+ max_eval_tokens: 60000
93
+ max_frames: 42000
94
+ max_input_tokens: 6000
95
+ max_sentences: 88
96
+ max_tokens: 128000
97
+ max_updates: 1000000
98
+ mel_loss: ssim:0.5|l1:0.5
99
+ mel_vmax: 1.5
100
+ mel_vmin: -6.0
101
+ min_level_db: -120
102
+ no_fs2: true
103
+ norm_type: gn
104
+ num_ckpt_keep: 10
105
+ num_heads: 2
106
+ num_sanity_val_steps: 1
107
+ num_spk: 1
108
+ num_test_samples: 0
109
+ num_valid_plots: 10
110
+ optimizer_adam_beta1: 0.9
111
+ optimizer_adam_beta2: 0.98
112
+ out_wav_norm: false
113
+ pe_ckpt: checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
114
+ pe_enable: false
115
+ perform_enhance: true
116
+ pitch_ar: false
117
+ pitch_enc_hidden_stride_kernel:
118
+ - 0,2,5
119
+ - 0,2,5
120
+ - 0,2,5
121
+ pitch_extractor: parselmouth
122
+ pitch_loss: l2
123
+ pitch_norm: log
124
+ pitch_type: frame
125
+ pndm_speedup: 10
126
+ pre_align_args:
127
+ allow_no_txt: false
128
+ denoise: false
129
+ forced_align: mfa
130
+ txt_processor: zh_g2pM
131
+ use_sox: true
132
+ use_tone: false
133
+ pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
134
+ predictor_dropout: 0.5
135
+ predictor_grad: 0.1
136
+ predictor_hidden: -1
137
+ predictor_kernel: 5
138
+ predictor_layers: 5
139
+ prenet_dropout: 0.5
140
+ prenet_hidden_size: 256
141
+ pretrain_fs_ckpt: ''
142
+ processed_data_dir: xxx
143
+ profile_infer: false
144
+ raw_data_dir: data/raw/aquapre
145
+ ref_norm_layer: bn
146
+ rel_pos: true
147
+ reset_phone_dict: true
148
+ residual_channels: 512
149
+ residual_layers: 20
150
+ save_best: false
151
+ save_ckpt: true
152
+ save_codes:
153
+ - configs
154
+ - modules
155
+ - src
156
+ - utils
157
+ save_f0: true
158
+ save_gt: false
159
+ schedule_type: linear
160
+ seed: 1234
161
+ sort_by_len: true
162
+ speaker_id: aqua
163
+ spec_max:
164
+ - 0.18377557396888733
165
+ - -0.33469653129577637
166
+ - -0.3073468506336212
167
+ - -0.21027648448944092
168
+ - 0.23178215324878693
169
+ - 0.5297451019287109
170
+ - 0.7021887898445129
171
+ - 0.7711099982261658
172
+ - 0.7912386059761047
173
+ - 0.6609739065170288
174
+ - 0.649876058101654
175
+ - 0.6327046751976013
176
+ - 0.6892049908638
177
+ - 0.6026111841201782
178
+ - 0.6834777593612671
179
+ - 0.7417489886283875
180
+ - 0.6040375828742981
181
+ - 0.5854794383049011
182
+ - 0.7123280167579651
183
+ - 0.5886657238006592
184
+ - 0.6135984063148499
185
+ - 0.5388530492782593
186
+ - 0.5932422280311584
187
+ - 0.535581111907959
188
+ - 0.57913738489151
189
+ - 0.6827316880226135
190
+ - 0.6265526413917542
191
+ - 0.6557696461677551
192
+ - 0.6586976647377014
193
+ - 0.5687282085418701
194
+ - 0.6218562722206116
195
+ - 0.6349128484725952
196
+ - 0.6176865100860596
197
+ - 0.6212958097457886
198
+ - 0.6277656555175781
199
+ - 0.5551338195800781
200
+ - 0.6126622557640076
201
+ - 0.5821346640586853
202
+ - 0.577056348323822
203
+ - 0.5649800300598145
204
+ - 0.5984634757041931
205
+ - 0.4873456656932831
206
+ - 0.47209471464157104
207
+ - 0.4387756586074829
208
+ - 0.4690910577774048
209
+ - 0.4616055190563202
210
+ - 0.3555675446987152
211
+ - 0.3898852467536926
212
+ - 0.3676068186759949
213
+ - 0.4632047414779663
214
+ - 0.37983986735343933
215
+ - 0.3877682685852051
216
+ - 0.3099276125431061
217
+ - 0.3261813223361969
218
+ - 0.34168118238449097
219
+ - 0.3004901111125946
220
+ - 0.3512653112411499
221
+ - 0.2647061347961426
222
+ - 0.2685043215751648
223
+ - 0.20390087366104126
224
+ - 0.1825377196073532
225
+ - 0.22067485749721527
226
+ - 0.20306138694286346
227
+ - 0.12710601091384888
228
+ - 0.10927848517894745
229
+ - 0.1117628887295723
230
+ - 0.14148156344890594
231
+ - 0.122605100274086
232
+ - 0.08032718300819397
233
+ - 0.12159623205661774
234
+ - -0.04923255369067192
235
+ - -0.07824847847223282
236
+ - 0.03441360592842102
237
+ - 0.07093964517116547
238
+ - -0.1269683688879013
239
+ - 0.0027632638812065125
240
+ - -0.045093610882759094
241
+ - -0.04115259647369385
242
+ - 0.029067598283290863
243
+ - -0.009453626349568367
244
+ - -0.0470033697783947
245
+ - -0.04894810542464256
246
+ - -0.06236470118165016
247
+ - -0.20086997747421265
248
+ - -0.2363593578338623
249
+ - -0.17289961874485016
250
+ - -0.219277486205101
251
+ - -0.2934815585613251
252
+ - -0.30551621317863464
253
+ - -0.2513120770454407
254
+ - -0.26792851090431213
255
+ - -0.33068278431892395
256
+ - -0.37532031536102295
257
+ - -0.365634560585022
258
+ - -0.3379015326499939
259
+ - -0.26979681849479675
260
+ - -0.20316314697265625
261
+ - -0.2109878957271576
262
+ - -0.16927000880241394
263
+ - -0.1698305308818817
264
+ - -0.2739156186580658
265
+ - -0.2700604200363159
266
+ - -0.32284122705459595
267
+ - -0.44529229402542114
268
+ - -0.4002469480037689
269
+ - -0.2441970407962799
270
+ - -0.19795942306518555
271
+ - -0.2462945580482483
272
+ - -0.0673084482550621
273
+ - -0.22117790579795837
274
+ - -0.21418607234954834
275
+ - -0.39467209577560425
276
+ - -0.4388139843940735
277
+ - -0.3227368891239166
278
+ - -0.30530503392219543
279
+ - -0.3201104998588562
280
+ - -0.39839836955070496
281
+ - -0.464596688747406
282
+ - -0.5399728417396545
283
+ - -0.5515261292457581
284
+ - -0.520453691482544
285
+ - -0.6714966893196106
286
+ - -0.6414765119552612
287
+ - -0.6108742356300354
288
+ - -0.6762520670890808
289
+ - -0.7067146301269531
290
+ - -0.7586700320243835
291
+ - -0.6640384793281555
292
+ spec_min:
293
+ - -4.999994277954102
294
+ - -4.999994277954102
295
+ - -4.999994277954102
296
+ - -4.999994277954102
297
+ - -4.999994277954102
298
+ - -4.999994277954102
299
+ - -4.999994277954102
300
+ - -4.999994277954102
301
+ - -4.999994277954102
302
+ - -4.999994277954102
303
+ - -4.999994277954102
304
+ - -4.999994277954102
305
+ - -4.999994277954102
306
+ - -4.999994277954102
307
+ - -4.999994277954102
308
+ - -4.999994277954102
309
+ - -4.999994277954102
310
+ - -4.999994277954102
311
+ - -4.999994277954102
312
+ - -4.999994277954102
313
+ - -4.999994277954102
314
+ - -4.999994277954102
315
+ - -4.999994277954102
316
+ - -4.999994277954102
317
+ - -4.999994277954102
318
+ - -4.999994277954102
319
+ - -4.999994277954102
320
+ - -4.999994277954102
321
+ - -4.999994277954102
322
+ - -4.999994277954102
323
+ - -4.999994277954102
324
+ - -4.999994277954102
325
+ - -4.999994277954102
326
+ - -4.999994277954102
327
+ - -4.999994277954102
328
+ - -4.999994277954102
329
+ - -4.999994277954102
330
+ - -4.999994277954102
331
+ - -4.999994277954102
332
+ - -4.999994277954102
333
+ - -4.999994277954102
334
+ - -4.999994277954102
335
+ - -4.999994277954102
336
+ - -4.999994277954102
337
+ - -4.999994277954102
338
+ - -4.999994277954102
339
+ - -4.999994277954102
340
+ - -4.999994277954102
341
+ - -4.999994277954102
342
+ - -4.999994277954102
343
+ - -4.999994277954102
344
+ - -4.999994277954102
345
+ - -4.999994277954102
346
+ - -4.999994277954102
347
+ - -4.999994277954102
348
+ - -4.999994277954102
349
+ - -4.999994277954102
350
+ - -4.999994277954102
351
+ - -4.999994277954102
352
+ - -4.999994277954102
353
+ - -4.999994277954102
354
+ - -4.999994277954102
355
+ - -4.999994277954102
356
+ - -4.999994277954102
357
+ - -4.999994277954102
358
+ - -4.999994277954102
359
+ - -4.999994277954102
360
+ - -4.999994277954102
361
+ - -4.999994277954102
362
+ - -4.999994277954102
363
+ - -4.999994277954102
364
+ - -4.999994277954102
365
+ - -4.999994277954102
366
+ - -4.999994277954102
367
+ - -4.999994277954102
368
+ - -4.999994277954102
369
+ - -4.999994277954102
370
+ - -4.999994277954102
371
+ - -4.999994277954102
372
+ - -4.999994277954102
373
+ - -4.999994277954102
374
+ - -4.999994277954102
375
+ - -4.999994277954102
376
+ - -4.999994277954102
377
+ - -4.999994277954102
378
+ - -4.999994277954102
379
+ - -4.999994277954102
380
+ - -4.999994277954102
381
+ - -4.999994277954102
382
+ - -4.999994277954102
383
+ - -4.999994277954102
384
+ - -4.999994277954102
385
+ - -4.999994277954102
386
+ - -4.999994277954102
387
+ - -4.999994277954102
388
+ - -4.999994277954102
389
+ - -4.999994277954102
390
+ - -4.999994277954102
391
+ - -4.999994277954102
392
+ - -4.999994277954102
393
+ - -4.999994277954102
394
+ - -4.999994277954102
395
+ - -4.999994277954102
396
+ - -4.999994277954102
397
+ - -4.999994277954102
398
+ - -4.999994277954102
399
+ - -4.999994277954102
400
+ - -4.999994277954102
401
+ - -4.999994277954102
402
+ - -4.999994277954102
403
+ - -4.999994277954102
404
+ - -4.999994277954102
405
+ - -4.999994277954102
406
+ - -4.999994277954102
407
+ - -4.999994277954102
408
+ - -4.999994277954102
409
+ - -4.999994277954102
410
+ - -4.999994277954102
411
+ - -4.999994277954102
412
+ - -4.999994277954102
413
+ - -4.999994277954102
414
+ - -4.999994277954102
415
+ - -4.999994277954102
416
+ - -4.999994277954102
417
+ - -4.999994277954102
418
+ - -4.999994277954102
419
+ - -4.989471912384033
420
+ - -4.999994277954102
421
+ spk_cond_steps: []
422
+ stop_token_weight: 5.0
423
+ task_cls: training.task.SVC_task.SVCTask
424
+ test_ids: []
425
+ test_input_dir: ''
426
+ test_num: 0
427
+ test_prefixes:
428
+ - test
429
+ test_set_name: test
430
+ timesteps: 1000
431
+ train_set_name: train
432
+ use_cn_hubert: false
433
+ use_crepe: true
434
+ use_denoise: false
435
+ use_energy_embed: false
436
+ use_gt_dur: false
437
+ use_gt_f0: false
438
+ use_midi: false
439
+ use_nsf: true
440
+ use_pitch_embed: true
441
+ use_pos_embed: true
442
+ use_spk_embed: false
443
+ use_spk_id: false
444
+ use_split_spk_id: false
445
+ use_uv: false
446
+ use_var_enc: false
447
+ use_vec: false
448
+ val_check_interval: 2000
449
+ valid_num: 0
450
+ valid_set_name: valid
451
+ vocoder: network.vocoders.nsf_hifigan.NsfHifiGAN
452
+ vocoder_ckpt: checkpoints/nsf_hifigan/model
453
+ warmup_updates: 2000
454
+ wav2spec_eps: 1e-6
455
+ weight_decay: 0
456
+ win_size: 2048
457
+ work_dir: checkpoints/aquapre
checkpoints/0102_xiaoma_pe/config.yaml ADDED
@@ -0,0 +1,172 @@
1
+ accumulate_grad_batches: 1
2
+ audio_num_mel_bins: 80
3
+ audio_sample_rate: 24000
4
+ base_config:
5
+ - configs/tts/lj/fs2.yaml
6
+ binarization_args:
7
+ shuffle: false
8
+ with_align: true
9
+ with_f0: true
10
+ with_f0cwt: true
11
+ with_spk_embed: true
12
+ with_txt: true
13
+ with_wav: false
14
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
15
+ binary_data_dir: data/binary/xiaoma1022_24k_128hop
16
+ check_val_every_n_epoch: 10
17
+ clip_grad_norm: 1
18
+ cwt_add_f0_loss: false
19
+ cwt_hidden_size: 128
20
+ cwt_layers: 2
21
+ cwt_loss: l1
22
+ cwt_std_scale: 0.8
23
+ debug: false
24
+ dec_ffn_kernel_size: 9
25
+ dec_layers: 4
26
+ decoder_type: fft
27
+ dict_dir: ''
28
+ dropout: 0.1
29
+ ds_workers: 4
30
+ dur_enc_hidden_stride_kernel:
31
+ - 0,2,3
32
+ - 0,2,3
33
+ - 0,1,3
34
+ dur_loss: mse
35
+ dur_predictor_kernel: 3
36
+ dur_predictor_layers: 2
37
+ enc_ffn_kernel_size: 9
38
+ enc_layers: 4
39
+ encoder_K: 8
40
+ encoder_type: fft
41
+ endless_ds: true
42
+ ffn_act: gelu
43
+ ffn_padding: SAME
44
+ fft_size: 512
45
+ fmax: 12000
46
+ fmin: 30
47
+ gen_dir_name: ''
48
+ hidden_size: 256
49
+ hop_size: 128
50
+ infer: false
51
+ lambda_commit: 0.25
52
+ lambda_energy: 0.1
53
+ lambda_f0: 1.0
54
+ lambda_ph_dur: 1.0
55
+ lambda_sent_dur: 1.0
56
+ lambda_uv: 1.0
57
+ lambda_word_dur: 1.0
58
+ load_ckpt: ''
59
+ log_interval: 100
60
+ loud_norm: false
61
+ lr: 2.0
62
+ max_epochs: 1000
63
+ max_eval_sentences: 1
64
+ max_eval_tokens: 60000
65
+ max_frames: 5000
66
+ max_input_tokens: 1550
67
+ max_sentences: 100000
68
+ max_tokens: 20000
69
+ max_updates: 60000
70
+ mel_loss: l1
71
+ mel_vmax: 1.5
72
+ mel_vmin: -6
73
+ min_level_db: -120
74
+ norm_type: gn
75
+ num_ckpt_keep: 3
76
+ num_heads: 2
77
+ num_sanity_val_steps: 5
78
+ num_spk: 1
79
+ num_test_samples: 20
80
+ num_valid_plots: 10
81
+ optimizer_adam_beta1: 0.9
82
+ optimizer_adam_beta2: 0.98
83
+ out_wav_norm: false
84
+ pitch_ar: false
85
+ pitch_enc_hidden_stride_kernel:
86
+ - 0,2,5
87
+ - 0,2,5
88
+ - 0,2,5
89
+ pitch_extractor_conv_layers: 2
90
+ pitch_loss: l1
91
+ pitch_norm: log
92
+ pitch_type: frame
93
+ pre_align_args:
94
+ allow_no_txt: false
95
+ denoise: false
96
+ forced_align: mfa
97
+ txt_processor: en
98
+ use_sox: false
99
+ use_tone: true
100
+ pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
101
+ predictor_dropout: 0.5
102
+ predictor_grad: 0.1
103
+ predictor_hidden: -1
104
+ predictor_kernel: 5
105
+ predictor_layers: 2
106
+ prenet_dropout: 0.5
107
+ prenet_hidden_size: 256
108
+ pretrain_fs_ckpt: ''
109
+ processed_data_dir: data/processed/ljspeech
110
+ profile_infer: false
111
+ raw_data_dir: data/raw/LJSpeech-1.1
112
+ ref_norm_layer: bn
113
+ reset_phone_dict: true
114
+ save_best: false
115
+ save_ckpt: true
116
+ save_codes:
117
+ - configs
118
+ - modules
119
+ - tasks
120
+ - utils
121
+ - usr
122
+ save_f0: false
123
+ save_gt: false
124
+ seed: 1234
125
+ sort_by_len: true
126
+ stop_token_weight: 5.0
127
+ task_cls: tasks.tts.pe.PitchExtractionTask
128
+ test_ids:
129
+ - 68
130
+ - 70
131
+ - 74
132
+ - 87
133
+ - 110
134
+ - 172
135
+ - 190
136
+ - 215
137
+ - 231
138
+ - 294
139
+ - 316
140
+ - 324
141
+ - 402
142
+ - 422
143
+ - 485
144
+ - 500
145
+ - 505
146
+ - 508
147
+ - 509
148
+ - 519
149
+ test_input_dir: ''
150
+ test_num: 523
151
+ test_set_name: test
152
+ train_set_name: train
153
+ use_denoise: false
154
+ use_energy_embed: false
155
+ use_gt_dur: false
156
+ use_gt_f0: false
157
+ use_pitch_embed: true
158
+ use_pos_embed: true
159
+ use_spk_embed: false
160
+ use_spk_id: false
161
+ use_split_spk_id: false
162
+ use_uv: true
163
+ use_var_enc: false
164
+ val_check_interval: 2000
165
+ valid_num: 348
166
+ valid_set_name: valid
167
+ vocoder: pwg
168
+ vocoder_ckpt: ''
169
+ warmup_updates: 2000
170
+ weight_decay: 0
171
+ win_size: 512
172
+ work_dir: checkpoints/0102_xiaoma_pe
checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1863f12324e43783089ab933edeeb969106b851e30d71019ebbaa9b82099d82a
3
+ size 39141959
checkpoints/hubert/hubert.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c72bad89da99152077bf8157ff75beca7c6dc966ea01a6a0fb3777f99e77aa9b
3
+ size 378353321
checkpoints/hubert/hubert_soft.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e82e7d079df05fe3aa535f6f7d42d309bdae1d2a53324e2b2386c56721f4f649
3
+ size 378435957
checkpoints/nsf_hifigan/NOTICE.txt ADDED
@@ -0,0 +1,74 @@
1
+ --- DiffSinger Community Vocoder ---
2
+
3
+ ARCHITECTURE: NSF-HiFiGAN
4
+ RELEASE DATE: 2022-12-11
5
+
6
+ HYPER PARAMETERS:
7
+ - 44100 sample rate
8
+ - 128 mel bins
9
+ - 512 hop size
10
+ - 2048 window size
11
+ - fmin at 40Hz
12
+ - fmax at 16000Hz
13
+
14
+
15
+ NOTICE:
16
+
17
+ All model weights in the [DiffSinger Community Vocoder Project](https://openvpi.github.io/vocoders/), including
18
+ model weights in this directory, are provided by the [OpenVPI Team](https://github.com/openvpi/), under the
19
+ [Attribution-NonCommercial-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-nc-sa/4.0/) license.
20
+
21
+
22
+ ACKNOWLEDGEMENTS:
23
+
24
+ Training data of this vocoder is provided and permitted by the following organizations, societies and individuals:
25
+
26
+ 孙飒 https://www.qfssr.cn
27
+ 赤松_Akamatsu https://www.zhibin.club
28
+ 乐威 https://www.zhibin.club
29
+ 伯添 https://space.bilibili.com/24087011
30
+ 雲宇光 https://space.bilibili.com/660675050
31
+ 橙子言 https://space.bilibili.com/318486464
32
+ 人衣大人 https://space.bilibili.com/2270344
33
+ 玖蝶 https://space.bilibili.com/676771003
34
+ Yuuko
35
+ 白夜零BYL https://space.bilibili.com/1605040503
36
+ 嗷天 https://space.bilibili.com/5675252
37
+ 洛泠羽 https://space.bilibili.com/347373318
38
+ 灰条纹的灰猫君 https://space.bilibili.com/2083633
39
+ 幽寂 https://space.bilibili.com/478860
40
+ 恶魔王女 https://space.bilibili.com/2475098
41
+ AlexYHX 芮晴
42
+ 绮萱 https://y.qq.com/n/ryqq/singer/003HjD6H4aZn1K
43
+ 诗芸 https://y.qq.com/n/ryqq/singer/0005NInj142zm0
44
+ 汐蕾 https://y.qq.com/n/ryqq/singer/0023cWMH1Bq1PJ
45
+ 1262917464
46
+ 炜阳
47
+ 叶卡yolka
48
+ 幸の夏 https://space.bilibili.com/1017297686
49
+ 暮色未量 https://space.bilibili.com/272904686
50
+ 晓寞sama https://space.bilibili.com/3463394
51
+ 没头绪的节操君
52
+ 串串BunC https://space.bilibili.com/95817834
53
+ 落雨 https://space.bilibili.com/1292427
54
+ 长尾巴的翎艾 https://space.bilibili.com/1638666
55
+ 声闻计划 https://space.bilibili.com/392812269
56
+ 唐家大小姐 http://5sing.kugou.com/palmusic/default.html
57
+ 不伊子
58
+
59
+ Training machines are provided by:
60
+
61
+ 花儿不哭 https://space.bilibili.com/5760446
62
+
63
+
64
+ TERMS OF REDISTRIBUTIONS:
65
+
66
+ 1. Do not sell this vocoder, or charge any fees from redistributing it, as prohibited by
67
+ the license.
68
+ 2. Include a copy of the CC BY-NC-SA 4.0 license, or a link referring to it.
69
+ 3. Include a copy of this notice, or any other notices informing that this vocoder is
70
+ provided by the OpenVPI Team, that this vocoder is licensed under CC BY-NC-SA 4.0, and
71
+ with a complete acknowledgement list as shown above.
72
+ 4. If you fine-tuned or modified the weights, leave a notice about what has been changed.
73
+ 5. (Optional) Leave a link to the official release page of the vocoder, and tell users
74
+ that other versions and future updates of this vocoder can be obtained from the website.
checkpoints/nsf_hifigan/config.json ADDED
@@ -0,0 +1,38 @@
1
+ {
2
+ "resblock": "1",
3
+ "num_gpus": 4,
4
+ "batch_size": 10,
5
+ "learning_rate": 0.0002,
6
+ "adam_b1": 0.8,
7
+ "adam_b2": 0.99,
8
+ "lr_decay": 0.999,
9
+ "seed": 1234,
10
+
11
+ "upsample_rates": [ 8, 8, 2, 2, 2],
12
+ "upsample_kernel_sizes": [16,16, 4, 4, 4],
13
+ "upsample_initial_channel": 512,
14
+ "resblock_kernel_sizes": [3,7,11],
15
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16
+ "discriminator_periods": [3, 5, 7, 11, 17, 23, 37],
17
+
18
+ "segment_size": 16384,
19
+ "num_mels": 128,
20
+ "num_freq": 1025,
21
+ "n_fft" : 2048,
22
+ "hop_size": 512,
23
+ "win_size": 2048,
24
+
25
+ "sampling_rate": 44100,
26
+
27
+ "fmin": 40,
28
+ "fmax": 16000,
29
+ "fmax_for_loss": null,
30
+
31
+ "num_workers": 16,
32
+
33
+ "dist_config": {
34
+ "dist_backend": "nccl",
35
+ "dist_url": "tcp://localhost:54321",
36
+ "world_size": 1
37
+ }
38
+ }
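The vocoder config above fixes the same analysis parameters that appear in checkpoints/nsf_hifigan/NOTICE.txt and in aqua/config.yaml (44100 Hz sample rate, 512-sample hop, 2048-sample window, 128 mel bins). A quick standalone arithmetic check of what those numbers imply; no repository code is needed, only values taken from the JSON above:

# Quick arithmetic check of the analysis parameters in checkpoints/nsf_hifigan/config.json.
sampling_rate = 44100   # Hz
hop_size = 512          # samples between mel frames
win_size = 2048         # STFT window length
segment_size = 16384    # training segment length in samples

frames_per_second = sampling_rate / hop_size      # ~86.13 mel frames per second
window_ms = win_size / sampling_rate * 1000       # ~46.4 ms analysis window
segment_seconds = segment_size / sampling_rate    # ~0.37 s training segments

print(f"{frames_per_second:.2f} frames/s, {window_ms:.1f} ms window, {segment_seconds:.3f} s segments")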
checkpoints/nsf_hifigan/model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c576b63b7ed952161b70fad34e0562ace502ce689195520d8a2a6c051de29d6
3
+ size 56825430
infer.py ADDED
@@ -0,0 +1,81 @@
1
+ import io
2
+ from pathlib import Path
3
+
4
+ import numpy as np
5
+ import soundfile
6
+
7
+ from infer_tools import infer_tool
8
+ from infer_tools import slicer
9
+ from infer_tools.infer_tool import Svc
10
+ from utils.hparams import hparams
11
+
12
+
13
+ def run_clip(raw_audio_path, svc_model, key, acc, use_crepe, spk_id=0, auto_key=False, out_path=None, slice_db=-40,
14
+ **kwargs):
15
+ print(f'code version:2023-01-22')
16
+
17
+ clean_name = Path(raw_audio_path).name.split(".")[0]
18
+ infer_tool.format_wav(raw_audio_path)
19
+ wav_path = Path(raw_audio_path).with_suffix('.wav')
20
+ key = svc_model.evaluate_key(wav_path, key, auto_key)
21
+ chunks = slicer.cut(wav_path, db_thresh=slice_db)
22
+ audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
23
+
24
+ count = 0
25
+ f0_tst, f0_pred, audio = [], [], []
26
+ for (slice_tag, data) in audio_data:
27
+ print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
28
+ length = int(np.ceil(len(data) / audio_sr * hparams['audio_sample_rate']))
29
+ raw_path = io.BytesIO()
30
+ soundfile.write(raw_path, data, audio_sr, format="wav")
31
+ raw_path.seek(0)
32
+ if slice_tag:
33
+ print('jump empty segment')
34
+ _f0_tst, _f0_pred, _audio = (
35
+ np.zeros(int(np.ceil(length / hparams['hop_size']))),
36
+ np.zeros(int(np.ceil(length / hparams['hop_size']))),
37
+ np.zeros(length))
38
+ else:
39
+ _f0_tst, _f0_pred, _audio = svc_model.infer(raw_path, spk_id=spk_id, key=key, acc=acc, use_crepe=use_crepe)
40
+ fix_audio = np.zeros(length)
41
+ fix_audio[:] = np.mean(_audio)
42
+ fix_audio[:len(_audio)] = _audio[0 if len(_audio) < len(fix_audio) else len(_audio) - len(fix_audio):]
43
+ f0_tst.extend(_f0_tst)
44
+ f0_pred.extend(_f0_pred)
45
+ audio.extend(list(fix_audio))
46
+ count += 1
47
+ if out_path is None:
48
+ out_path = f'./results/{clean_name}_{key}key_{project_name}_{hparams["residual_channels"]}_{hparams["residual_layers"]}_{int(step / 1000)}k_{accelerate}x.{kwargs["format"]}'
49
+ soundfile.write(out_path, audio, hparams["audio_sample_rate"], 'PCM_16', format=out_path.split('.')[-1])
50
+ return np.array(f0_tst), np.array(f0_pred), audio
51
+
52
+
53
+ if __name__ == '__main__':
54
+ # Project folder name (the one used during training)
55
+ project_name = "open-aqua"
56
+ model_path = f'./checkpoints/{project_name}/model_ckpt_steps_90000.ckpt'
57
+ config_path = f'./checkpoints/{project_name}/config.yaml'
58
+
59
+ # Multiple wav/ogg files are supported; put them in the raw folder, with file extensions
60
+ file_names = ["横竖撇点折-main-2key.wav"]
61
+ spk_id = "single"
62
+ # Adaptive key shift (single-speaker models only)
63
+ auto_key = False
64
+ trans = [0] # Pitch shift in semitones, positive or negative; one value per file above, padded with the first value if fewer are given
65
+ # Acceleration factor
66
+ accelerate = 1
67
+ hubert_gpu = True
68
+ wav_format = 'wav'
69
+ step = int(model_path.split("_")[-1].split(".")[0])
70
+
71
+ # Do not modify anything below
72
+ infer_tool.mkdir(["./raw", "./results"])
73
+ infer_tool.fill_a_to_b(trans, file_names)
74
+
75
+ model = Svc(project_name, config_path, hubert_gpu, model_path, onnx=False)
76
+ for f_name, tran in zip(file_names, trans):
77
+ if "." not in f_name:
78
+ f_name += ".wav"
79
+ audio_path = f"./raw/{f_name}"
80
+ run_clip(raw_audio_path=audio_path, svc_model=model, key=tran, acc=accelerate, use_crepe=False,
81
+ spk_id=spk_id, auto_key=auto_key, project_name=project_name, format=wav_format)
infer_tools/__pycache__/f0_static.cpython-38.pyc ADDED
Binary file (5.12 kB). View file
 
infer_tools/__pycache__/infer_tool.cpython-38.pyc ADDED
Binary file (7.26 kB). View file
 
infer_tools/__pycache__/infer_tool_beta.cpython-38.pyc ADDED
Binary file (7.8 kB). View file
 
infer_tools/__pycache__/slicer.cpython-38.pyc ADDED
Binary file (3.84 kB). View file
 
infer_tools/__pycache__/trans_key.cpython-38.pyc ADDED
Binary file (2 kB). View file
 
infer_tools/f0_static.py ADDED
@@ -0,0 +1,116 @@
1
+ import json
2
+ import os
3
+ import shutil
4
+ from functools import reduce
5
+ from pathlib import Path
6
+
7
+ import matplotlib
8
+ import matplotlib.pyplot as plt
9
+ import yaml
10
+ from pylab import xticks, np
11
+ from tqdm import tqdm
12
+
13
+ from modules.vocoders.nsf_hifigan import NsfHifiGAN
14
+ from preprocessing.process_pipeline import get_pitch_parselmouth, get_pitch_crepe
15
+ from utils.hparams import set_hparams, hparams
16
+
17
+ head_list = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
18
+
19
+
20
+ def compare_pitch(f0_static_dict, pitch_time_temp, trans_key=0):
21
+ return sum({k: v * f0_static_dict[str(k + trans_key)] for k, v in pitch_time_temp.items() if
22
+ str(k + trans_key) in f0_static_dict}.values())
23
+
24
+
25
+ def f0_to_pitch(ff):
26
+ f0_pitch = 69 + 12 * np.log2(ff / 440)
27
+ return round(f0_pitch, 0)
28
+
29
+
30
+ def pitch_to_name(pitch):
31
+ return f"{head_list[int(pitch % 12)]}{int(pitch / 12) - 1}"
32
+
33
+
34
+ def get_f0(audio_path, crepe=False):
35
+ wav, mel = NsfHifiGAN.wav2spec(audio_path)
36
+ if crepe:
37
+ f0, pitch_coarse = get_pitch_crepe(wav, mel, hparams)
38
+ else:
39
+ f0, pitch_coarse = get_pitch_parselmouth(wav, mel, hparams)
40
+ return f0
41
+
42
+
43
+ def merge_f0_dict(dict_list):
44
+ def sum_dict(a, b):
45
+ temp = dict()
46
+ for key in a.keys() | b.keys():
47
+ temp[key] = sum([d.get(key, 0) for d in (a, b)])
48
+ return temp
49
+
50
+ return reduce(sum_dict, dict_list)
51
+
52
+
53
+ def collect_f0(f0):
54
+ pitch_num = {}
55
+ pitch_list = [f0_to_pitch(x) for x in f0[f0 > 0]]
56
+ for key in pitch_list:
57
+ pitch_num[key] = pitch_num.get(key, 0) + 1
58
+ return pitch_num
59
+
60
+
61
+ def static_f0_time(f0):
62
+ if isinstance(f0, dict):
63
+ pitch_num = merge_f0_dict({k: collect_f0(v) for k, v in f0.items()}.values())
64
+ else:
65
+ pitch_num = collect_f0(f0)
66
+ static_pitch_time = {}
67
+ sort_key = sorted(pitch_num.keys())
68
+ for key in sort_key:
69
+ static_pitch_time[key] = round(pitch_num[key] * hparams['hop_size'] / hparams['audio_sample_rate'], 2)
70
+ return static_pitch_time
71
+
72
+
73
+ def get_end_file(dir_path, end):
74
+ file_lists = []
75
+ for root, dirs, files in os.walk(dir_path):
76
+ files = [f for f in files if f[0] != '.']
77
+ dirs[:] = [d for d in dirs if d[0] != '.']
78
+ for f_file in files:
79
+ if f_file.endswith(end):
80
+ file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
81
+ return file_lists
82
+
83
+
84
+ if __name__ == "__main__":
85
+ # Add f0_static vocal-range statistics to the config file
86
+ config_path = "F:/sovits/diff-svc-main/checkpoints/aquapre/config.yaml"
87
+ hparams = set_hparams(config=config_path, exp_name='', infer=True, reset=True, hparams_str='', print_hparams=False)
88
+ f0_dict = {}
89
+ # Collect all wav files under the batch folder
90
+ wav_paths = get_end_file("F:/sovits/diff-svc-main/batch/aquapre", "wav")
91
+ # Extract f0 with parselmouth
92
+ with tqdm(total=len(wav_paths)) as p_bar:
93
+ p_bar.set_description('Processing')
94
+ for wav_path in wav_paths:
95
+ f0_dict[wav_path] = get_f0(wav_path, crepe=False)
96
+ p_bar.update(1)
97
+ pitch_time = static_f0_time(f0_dict)
98
+ total_time = round(sum(pitch_time.values()), 2)
99
+ pitch_time["total_time"] = total_time
100
+ print(f"total time: {total_time}s")
101
+ shutil.copy(config_path, f"{Path(config_path).parent}\\back_{Path(config_path).name}")
102
+ with open(config_path, encoding='utf-8') as f:
103
+ _hparams = yaml.safe_load(f)
104
+ _hparams['f0_static'] = json.dumps(pitch_time)
105
+ with open(config_path, 'w', encoding='utf-8') as f:
106
+ yaml.safe_dump(_hparams, f)
107
+ print("原config文件已在原目录建立备份:back_config.yaml")
108
+ print("音域统计已保存至config文件,此模型可使用自动变调功能")
109
+ matplotlib.use('TkAgg')
110
+ plt.title("数据集音域统计", fontproperties='SimHei')
111
+ plt.xlabel("音高", fontproperties='SimHei')
112
+ plt.ylabel("时长(s)", fontproperties='SimHei')
113
+ xticks_labels = [pitch_to_name(i) for i in range(36, 96)]
114
+ xticks(np.linspace(36, 96, 60, endpoint=True), xticks_labels)
115
+ plt.plot(pitch_time.keys(), pitch_time.values(), color='dodgerblue')
116
+ plt.show()
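The statistics above rest on the standard MIDI note mapping in f0_to_pitch (69 + 12*log2(f0/440)) and the note naming in pitch_to_name. A standalone sanity check of that math, with the two helpers re-implemented so it runs without the repository on the path; 220/440/880 Hz are example inputs:

import numpy as np

head_list = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

def f0_to_pitch(ff):
    # Same formula as infer_tools/f0_static.py: MIDI note number from frequency in Hz.
    return round(69 + 12 * np.log2(ff / 440), 0)

def pitch_to_name(pitch):
    # MIDI note number -> note name, e.g. 69 -> "A4".
    return f"{head_list[int(pitch % 12)]}{int(pitch / 12) - 1}"

for hz in (220.0, 440.0, 880.0):
    p = f0_to_pitch(hz)
    print(hz, "Hz ->", p, pitch_to_name(p))   # 220 Hz -> A3, 440 Hz -> A4, 880 Hz -> A5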
infer_tools/f0_temp.json ADDED
The diff for this file is too large to render. See raw diff
 
infer_tools/infer_tool.py ADDED
@@ -0,0 +1,201 @@
1
+ import json
2
+ import os
3
+ import time
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+
7
+ import librosa
8
+ import numpy as np
9
+ import soundfile
10
+ import torch
11
+
12
+ import utils
13
+ from infer_tools.f0_static import compare_pitch, static_f0_time
14
+ from modules.diff.diffusion import GaussianDiffusion
15
+ from modules.diff.net import DiffNet
16
+ from modules.vocoders.nsf_hifigan import NsfHifiGAN
17
+ from preprocessing.hubertinfer import HubertEncoder
18
+ from preprocessing.process_pipeline import File2Batch, get_pitch_parselmouth
19
+ from utils.hparams import hparams, set_hparams
20
+ from utils.pitch_utils import denorm_f0, norm_interp_f0
21
+
22
+
23
+ def timeit(func):
24
+ def run(*args, **kwargs):
25
+ t = time.time()
26
+ res = func(*args, **kwargs)
27
+ print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
28
+ return res
29
+
30
+ return run
31
+
32
+
33
+ def format_wav(audio_path):
34
+ if Path(audio_path).suffix == '.wav':
35
+ return
36
+ raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
37
+ soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)
38
+
39
+
40
+ def fill_a_to_b(a, b):
41
+ if len(a) < len(b):
42
+ for _ in range(0, len(b) - len(a)):
43
+ a.append(a[0])
44
+
45
+
46
+ def get_end_file(dir_path, end):
47
+ file_lists = []
48
+ for root, dirs, files in os.walk(dir_path):
49
+ files = [f for f in files if f[0] != '.']
50
+ dirs[:] = [d for d in dirs if d[0] != '.']
51
+ for f_file in files:
52
+ if f_file.endswith(end):
53
+ file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
54
+ return file_lists
55
+
56
+
57
+ def mkdir(paths: list):
58
+ for path in paths:
59
+ if not os.path.exists(path):
60
+ os.mkdir(path)
61
+
62
+
63
+ class Svc:
64
+ def __init__(self, project_name, config_name, hubert_gpu, model_path, onnx=False):
65
+ self.project_name = project_name
66
+ self.DIFF_DECODERS = {
67
+ 'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']),
68
+ }
69
+
70
+ self.model_path = model_path
71
+ self.dev = torch.device("cpu")
72
+
73
+ self._ = set_hparams(config=config_name, exp_name=self.project_name, infer=True,
74
+ reset=True, hparams_str='', print_hparams=False)
75
+
76
+ hparams['hubert_gpu'] = hubert_gpu
77
+ self.hubert = HubertEncoder(hparams['hubert_path'], onnx=onnx)
78
+ self.model = GaussianDiffusion(
79
+ phone_encoder=self.hubert,
80
+ out_dims=hparams['audio_num_mel_bins'],
81
+ denoise_fn=self.DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
82
+ timesteps=hparams['timesteps'],
83
+ K_step=hparams['K_step'],
84
+ loss_type=hparams['diff_loss_type'],
85
+ spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
86
+ )
87
+ utils.load_ckpt(self.model, self.model_path, 'model', force=True, strict=True)
88
+ self.model.to(self.dev)
89
+ self.vocoder = NsfHifiGAN()
90
+
91
+ def infer(self, in_path, key, acc, spk_id=0, use_crepe=True, singer=False):
92
+ batch = self.pre(in_path, acc, spk_id, use_crepe)
93
+ batch['f0'] = batch['f0'] + (key / 12)
94
+ batch['f0'][batch['f0'] > np.log2(hparams['f0_max'])] = 0
95
+
96
+ @timeit
97
+ def diff_infer():
98
+ spk_embed = batch.get('spk_embed') if not hparams['use_spk_id'] else batch.get('spk_ids')
99
+ energy = batch.get('energy').cpu() if batch.get('energy') else None
100
+ if spk_embed is None:
101
+ spk_embed = torch.LongTensor([0]).cpu()
102
+ diff_outputs = self.model(
103
+ hubert=batch['hubert'].cpu(), spk_embed_id=spk_embed.cpu(), mel2ph=batch['mel2ph'].cpu(),
104
+ f0=batch['f0'].cpu(), energy=energy, ref_mels=batch["mels"].cpu(), infer=True)
105
+ return diff_outputs
106
+
107
+ outputs = diff_infer()
108
+ batch['outputs'] = outputs['mel_out']
109
+ batch['mel2ph_pred'] = outputs['mel2ph']
110
+ batch['f0_gt'] = denorm_f0(batch['f0'], batch['uv'], hparams)
111
+ batch['f0_pred'] = outputs.get('f0_denorm')
112
+ return self.after_infer(batch, singer, in_path)
113
+
114
+ @timeit
115
+ def after_infer(self, prediction, singer, in_path):
116
+ for k, v in prediction.items():
117
+ if type(v) is torch.Tensor:
118
+ prediction[k] = v.cpu().numpy()
119
+
120
+ # remove paddings
121
+ mel_gt = prediction["mels"]
122
+ mel_gt_mask = np.abs(mel_gt).sum(-1) > 0
123
+
124
+ mel_pred = prediction["outputs"]
125
+ mel_pred_mask = np.abs(mel_pred).sum(-1) > 0
126
+ mel_pred = mel_pred[mel_pred_mask]
127
+ mel_pred = np.clip(mel_pred, hparams['mel_vmin'], hparams['mel_vmax'])
128
+
129
+ f0_gt = prediction.get("f0_gt")
130
+ f0_pred = prediction.get("f0_pred")
131
+ if f0_pred is not None:
132
+ f0_gt = f0_gt[mel_gt_mask]
133
+ if len(f0_pred) > len(mel_pred_mask):
134
+ f0_pred = f0_pred[:len(mel_pred_mask)]
135
+ f0_pred = f0_pred[mel_pred_mask]
136
+ torch.cuda.is_available() and torch.cuda.empty_cache()
137
+
138
+ if singer:
139
+ data_path = in_path.replace("batch", "singer_data")
140
+ mel_path = data_path[:-4] + "_mel.npy"
141
+ f0_path = data_path[:-4] + "_f0.npy"
142
+ np.save(mel_path, mel_pred)
143
+ np.save(f0_path, f0_pred)
144
+ wav_pred = self.vocoder.spec2wav(mel_pred, f0=f0_pred)
145
+ return f0_gt, f0_pred, wav_pred
146
+
147
+ def pre(self, wav_fn, accelerate, spk_id=0, use_crepe=True):
148
+ if isinstance(wav_fn, BytesIO):
149
+ item_name = self.project_name
150
+ else:
151
+ song_info = wav_fn.split('/')
152
+ item_name = song_info[-1].split('.')[-2]
153
+ temp_dict = {'wav_fn': wav_fn, 'spk_id': spk_id, 'id': 0}
154
+
155
+ temp_dict = File2Batch.temporary_dict2processed_input(item_name, temp_dict, self.hubert, infer=True,
156
+ use_crepe=use_crepe)
157
+ hparams['pndm_speedup'] = accelerate
158
+ batch = File2Batch.processed_input2batch([getitem(temp_dict)])
159
+ return batch
160
+
161
+ def evaluate_key(self, wav_path, key, auto_key):
162
+ if "f0_static" in hparams.keys():
163
+ f0_static = json.loads(hparams['f0_static'])
164
+ wav, mel = self.vocoder.wav2spec(wav_path)
165
+ input_f0 = get_pitch_parselmouth(wav, mel, hparams)[0]
166
+ pitch_time_temp = static_f0_time(input_f0)
167
+ eval_dict = {}
168
+ for trans_key in range(-12, 12):
169
+ eval_dict[trans_key] = compare_pitch(f0_static, pitch_time_temp, trans_key=trans_key)
170
+ sort_key = sorted(eval_dict, key=eval_dict.get, reverse=True)[:5]
171
+ print(f"推荐移调:{sort_key}")
172
+ if auto_key:
173
+ print(f"自动变调已启用,您的输入key被{sort_key[0]}key覆盖,控制参数为auto_key")
174
+ return sort_key[0]
175
+ else:
176
+ print("config缺少f0_staic,无法使用自动变调,可通过infer_tools/data_static添加")
177
+ return key
178
+
179
+
180
+ def getitem(item):
181
+ max_frames = hparams['max_frames']
182
+ spec = torch.Tensor(item['mel']).cpu()[:max_frames]
183
+ mel2ph = torch.LongTensor(item['mel2ph']).cpu()[:max_frames] if 'mel2ph' in item else None
184
+ f0, uv = norm_interp_f0(item["f0"][:max_frames], hparams)
185
+ hubert = torch.Tensor(item['hubert'][:hparams['max_input_tokens']]).cpu()
186
+ pitch = torch.LongTensor(item.get("pitch")).cpu()[:max_frames]
187
+ sample = {
188
+ "id": item['id'],
189
+ "spk_id": item['spk_id'],
190
+ "item_name": item['item_name'],
191
+ "hubert": hubert,
192
+ "mel": spec,
193
+ "pitch": pitch,
194
+ "f0": f0,
195
+ "uv": uv,
196
+ "mel2ph": mel2ph,
197
+ "mel_nonpadding": spec.abs().sum(-1) > 0,
198
+ }
199
+ if hparams['use_energy_embed']:
200
+ sample['energy'] = item['energy']
201
+ return sample
infer_tools/slicer.py ADDED
@@ -0,0 +1,142 @@
1
+ import librosa
2
+ import torch
3
+ import torchaudio
4
+
5
+
6
+ class Slicer:
7
+ def __init__(self,
8
+ sr: int,
9
+ threshold: float = -40.,
10
+ min_length: int = 5000,
11
+ min_interval: int = 300,
12
+ hop_size: int = 20,
13
+ max_sil_kept: int = 5000):
14
+ if not min_length >= min_interval >= hop_size:
15
+ raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
16
+ if not max_sil_kept >= hop_size:
17
+ raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
18
+ min_interval = sr * min_interval / 1000
19
+ self.threshold = 10 ** (threshold / 20.)
20
+ self.hop_size = round(sr * hop_size / 1000)
21
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
22
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
23
+ self.min_interval = round(min_interval / self.hop_size)
24
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
25
+
26
+ def _apply_slice(self, waveform, begin, end):
27
+ if len(waveform.shape) > 1:
28
+ return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
29
+ else:
30
+ return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
31
+
32
+ # @timeit
33
+ def slice(self, waveform):
34
+ if len(waveform.shape) > 1:
35
+ samples = librosa.to_mono(waveform)
36
+ else:
37
+ samples = waveform
38
+ if samples.shape[0] <= self.min_length:
39
+ return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
40
+ rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
41
+ sil_tags = []
42
+ silence_start = None
43
+ clip_start = 0
44
+ for i, rms in enumerate(rms_list):
45
+ # Keep looping while frame is silent.
46
+ if rms < self.threshold:
47
+ # Record start of silent frames.
48
+ if silence_start is None:
49
+ silence_start = i
50
+ continue
51
+ # Keep looping while frame is not silent and silence start has not been recorded.
52
+ if silence_start is None:
53
+ continue
54
+ # Clear recorded silence start if interval is not enough or clip is too short
55
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
56
+ need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
57
+ if not is_leading_silence and not need_slice_middle:
58
+ silence_start = None
59
+ continue
60
+ # Need slicing. Record the range of silent frames to be removed.
61
+ if i - silence_start <= self.max_sil_kept:
62
+ pos = rms_list[silence_start: i + 1].argmin() + silence_start
63
+ if silence_start == 0:
64
+ sil_tags.append((0, pos))
65
+ else:
66
+ sil_tags.append((pos, pos))
67
+ clip_start = pos
68
+ elif i - silence_start <= self.max_sil_kept * 2:
69
+ pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
70
+ pos += i - self.max_sil_kept
71
+ pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
72
+ pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
73
+ if silence_start == 0:
74
+ sil_tags.append((0, pos_r))
75
+ clip_start = pos_r
76
+ else:
77
+ sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
78
+ clip_start = max(pos_r, pos)
79
+ else:
80
+ pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
81
+ pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
82
+ if silence_start == 0:
83
+ sil_tags.append((0, pos_r))
84
+ else:
85
+ sil_tags.append((pos_l, pos_r))
86
+ clip_start = pos_r
87
+ silence_start = None
88
+ # Deal with trailing silence.
89
+ total_frames = rms_list.shape[0]
90
+ if silence_start is not None and total_frames - silence_start >= self.min_interval:
91
+ silence_end = min(total_frames, silence_start + self.max_sil_kept)
92
+ pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
93
+ sil_tags.append((pos, total_frames + 1))
94
+ # Apply and return slices.
95
+ if len(sil_tags) == 0:
96
+ return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
97
+ else:
98
+ chunks = []
99
+ # The first silent segment does not start at the beginning; add the leading voiced segment
100
+ if sil_tags[0][0]:
101
+ chunks.append(
102
+ {"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
103
+ for i in range(0, len(sil_tags)):
104
+ # Mark voiced segments (skipping the first one)
105
+ if i:
106
+ chunks.append({"slice": False,
107
+ "split_time": f"{sil_tags[i - 1][1] * self.hop_size},{min(waveform.shape[0], sil_tags[i][0] * self.hop_size)}"})
108
+ # Mark all silent segments
109
+ chunks.append({"slice": True,
110
+ "split_time": f"{sil_tags[i][0] * self.hop_size},{min(waveform.shape[0], sil_tags[i][1] * self.hop_size)}"})
111
+ # The last silent segment does not reach the end; add the trailing segment
112
+ if sil_tags[-1][1] * self.hop_size < len(waveform):
113
+ chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
114
+ chunk_dict = {}
115
+ for i in range(len(chunks)):
116
+ chunk_dict[str(i)] = chunks[i]
117
+ return chunk_dict
118
+
119
+
120
+ def cut(audio_path, db_thresh=-30, min_len=5000):
121
+ audio, sr = librosa.load(audio_path, sr=None)
122
+ slicer = Slicer(
123
+ sr=sr,
124
+ threshold=db_thresh,
125
+ min_length=min_len
126
+ )
127
+ chunks = slicer.slice(audio)
128
+ return chunks
129
+
130
+
131
+ def chunks2audio(audio_path, chunks):
132
+ chunks = dict(chunks)
133
+ audio, sr = torchaudio.load(audio_path)
134
+ if len(audio.shape) == 2 and audio.shape[1] >= 2:
135
+ audio = torch.mean(audio, dim=0).unsqueeze(0)
136
+ audio = audio.cpu().numpy()[0]
137
+ result = []
138
+ for k, v in chunks.items():
139
+ tag = v["split_time"].split(",")
140
+ if tag[0] != tag[1]:
141
+ result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
142
+ return result, sr
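Slicer compares per-frame RMS values against a linear amplitude threshold derived from the dB value passed to cut() (self.threshold = 10 ** (threshold / 20)). A short standalone illustration of that conversion and of the frame sizes produced by the defaults; sr = 44100 is the project's sample rate, db_thresh = -40 is run_clip's slice_db default, and the millisecond values are the Slicer defaults shown above:

# How Slicer's threshold and hop/window sizes are derived (example values).
sr = 44100             # sample rate of the input audio
db_thresh = -40.0      # silence threshold in dB, as used by infer.run_clip
hop_ms = 20            # Slicer default hop_size in milliseconds
min_interval_ms = 300  # Slicer default min_interval in milliseconds

threshold = 10 ** (db_thresh / 20.0)              # -40 dB -> 0.01 linear RMS
hop_size = round(sr * hop_ms / 1000)              # 882 samples per RMS frame
win_size = min(round(sr * min_interval_ms / 1000), 4 * hop_size)  # 3528 samples

print(threshold, hop_size, win_size)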
infer_tools/trans_key.py ADDED
@@ -0,0 +1,67 @@
1
+ import os
2
+ head_list = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
3
+
4
+
5
+ def trans_f0_seq(feature_pit, transform):
6
+ feature_pit = feature_pit * 2 ** (transform / 12)
7
+ return round(feature_pit, 1)
8
+
9
+
10
+ def move_key(raw_data, mv_key):
11
+ head = raw_data[:-1]
12
+ body = int(raw_data[-1])
13
+ new_head_index = head_list.index(head) + mv_key
14
+ while new_head_index < 0:
15
+ body -= 1
16
+ new_head_index += 12
17
+ while new_head_index > 11:
18
+ body += 1
19
+ new_head_index -= 12
20
+ result_data = head_list[new_head_index] + str(body)
21
+ return result_data
22
+
23
+
24
+ def trans_key(raw_data, key):
25
+ for i in raw_data:
26
+ note_seq_list = i["note_seq"].split(" ")
27
+ new_note_seq_list = []
28
+ for note_seq in note_seq_list:
29
+ if note_seq != "rest":
30
+ new_note_seq = move_key(note_seq, key)
31
+ new_note_seq_list.append(new_note_seq)
32
+ else:
33
+ new_note_seq_list.append(note_seq)
34
+ i["note_seq"] = " ".join(new_note_seq_list)
35
+
36
+ f0_seq_list = i["f0_seq"].split(" ")
37
+ f0_seq_list = [float(x) for x in f0_seq_list]
38
+ new_f0_seq_list = []
39
+ for f0_seq in f0_seq_list:
40
+ new_f0_seq = trans_f0_seq(f0_seq, key)
41
+ new_f0_seq_list.append(str(new_f0_seq))
42
+ i["f0_seq"] = " ".join(new_f0_seq_list)
43
+ return raw_data
44
+
45
+
46
+ def trans_opencpop(raw_txt, res_txt, key):
47
+ if os.path.exists(raw_txt):
48
+ f_w = open(res_txt, "w", encoding='utf-8')
49
+ with open(raw_txt, "r", encoding='utf-8') as f:
50
+ raw_data = f.readlines()
51
+ for raw in raw_data:
52
+ raw_list = raw.split("|")
53
+ new_note_seq_list = []
54
+ for note_seq in raw_list[3].split(" "):
55
+ if note_seq != "rest":
56
+ note_seq = note_seq.split("/")[0] if "/" in note_seq else note_seq
57
+ new_note_seq = move_key(note_seq, key)
58
+ new_note_seq_list.append(new_note_seq)
59
+ else:
60
+ new_note_seq_list.append(note_seq)
61
+ raw_list[3] = " ".join(new_note_seq_list)
62
+ f_w.write("|".join(raw_list))
63
+ f_w.close()
64
+ print("opencpop标注文件转换完毕")
65
+ else:
66
+ print("未发现opencpop标注文件,请检查路径")
67
+
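move_key transposes a note name such as "C4" by a number of semitones, wrapping the octave, and trans_f0_seq scales an f0 value by 2 ** (key / 12). A small standalone check of both, re-implemented here so it runs without the repository on the path (importing them from infer_tools.trans_key would work as well); the notes and the 440 Hz value are example inputs:

head_list = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

def move_key(raw_data, mv_key):
    # Same logic as infer_tools/trans_key.py: shift a note name by mv_key semitones.
    head, body = raw_data[:-1], int(raw_data[-1])
    new_head_index = head_list.index(head) + mv_key
    while new_head_index < 0:
        body -= 1
        new_head_index += 12
    while new_head_index > 11:
        body += 1
        new_head_index -= 12
    return head_list[new_head_index] + str(body)

def trans_f0_seq(feature_pit, transform):
    # Shift an f0 value (Hz) by `transform` semitones.
    return round(feature_pit * 2 ** (transform / 12), 1)

print(move_key("C4", -1))        # B3: wraps down across the octave boundary
print(move_key("A4", 3))         # C5
print(trans_f0_seq(440.0, 12))   # 880.0: one octave up doubles the frequency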
modules/__pycache__/encoder.cpython-310.pyc ADDED
Binary file (7.19 kB). View file
 
modules/__pycache__/encoder.cpython-38.pyc ADDED
Binary file (7.17 kB). View file
 
modules/commons/__pycache__/common_layers.cpython-310.pyc ADDED
Binary file (18.6 kB). View file
 
modules/commons/__pycache__/common_layers.cpython-38.pyc ADDED
Binary file (18.9 kB). View file
 
modules/commons/__pycache__/ssim.cpython-310.pyc ADDED
Binary file (2.67 kB). View file
 
modules/commons/__pycache__/ssim.cpython-38.pyc ADDED
Binary file (2.68 kB). View file
 
modules/commons/common_layers.py ADDED
@@ -0,0 +1,675 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import torch.onnx.operators
6
+ from torch import nn
7
+ from torch.nn import Parameter
8
+
9
+ import utils
10
+
11
+
12
+ class Reshape(nn.Module):
13
+ def __init__(self, *args):
14
+ super(Reshape, self).__init__()
15
+ self.shape = args
16
+
17
+ def forward(self, x):
18
+ return x.view(self.shape)
19
+
20
+
21
+ class Permute(nn.Module):
22
+ def __init__(self, *args):
23
+ super(Permute, self).__init__()
24
+ self.args = args
25
+
26
+ def forward(self, x):
27
+ return x.permute(self.args)
28
+
29
+
30
+ class LinearNorm(torch.nn.Module):
31
+ def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
32
+ super(LinearNorm, self).__init__()
33
+ self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
34
+
35
+ torch.nn.init.xavier_uniform_(
36
+ self.linear_layer.weight,
37
+ gain=torch.nn.init.calculate_gain(w_init_gain))
38
+
39
+ def forward(self, x):
40
+ return self.linear_layer(x)
41
+
42
+
43
+ class ConvNorm(torch.nn.Module):
44
+ def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
45
+ padding=None, dilation=1, bias=True, w_init_gain='linear'):
46
+ super(ConvNorm, self).__init__()
47
+ if padding is None:
48
+ assert (kernel_size % 2 == 1)
49
+ padding = int(dilation * (kernel_size - 1) / 2)
50
+
51
+ self.conv = torch.nn.Conv1d(in_channels, out_channels,
52
+ kernel_size=kernel_size, stride=stride,
53
+ padding=padding, dilation=dilation,
54
+ bias=bias)
55
+
56
+ torch.nn.init.xavier_uniform_(
57
+ self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
58
+
59
+ def forward(self, signal):
60
+ conv_signal = self.conv(signal)
61
+ return conv_signal
62
+
63
+
64
+ def Embedding(num_embeddings, embedding_dim, padding_idx=None):
65
+ m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
66
+ nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
67
+ if padding_idx is not None:
68
+ nn.init.constant_(m.weight[padding_idx], 0)
69
+ return m
70
+
71
+
72
+ def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
73
+ if not export and torch.cuda.is_available():
74
+ try:
75
+ from apex.normalization import FusedLayerNorm
76
+ return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
77
+ except ImportError:
78
+ pass
79
+ return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
80
+
81
+
82
+ def Linear(in_features, out_features, bias=True):
83
+ m = nn.Linear(in_features, out_features, bias)
84
+ nn.init.xavier_uniform_(m.weight)
85
+ if bias:
86
+ nn.init.constant_(m.bias, 0.)
87
+ return m
88
+
89
+
90
+ class SinusoidalPositionalEmbedding(nn.Module):
91
+ """This module produces sinusoidal positional embeddings of any length.
92
+
93
+ Padding symbols are ignored.
94
+ """
95
+
96
+ def __init__(self, embedding_dim, padding_idx, init_size=1024):
97
+ super().__init__()
98
+ self.embedding_dim = embedding_dim
99
+ self.padding_idx = padding_idx
100
+ self.weights = SinusoidalPositionalEmbedding.get_embedding(
101
+ init_size,
102
+ embedding_dim,
103
+ padding_idx,
104
+ )
105
+ self.register_buffer('_float_tensor', torch.FloatTensor(1))
106
+
107
+ @staticmethod
108
+ def get_embedding(num_embeddings, embedding_dim, padding_idx=None):
109
+ """Build sinusoidal embeddings.
110
+
111
+ This matches the implementation in tensor2tensor, but differs slightly
112
+ from the description in Section 3.5 of "Attention Is All You Need".
113
+ """
114
+ half_dim = embedding_dim // 2
115
+ emb = math.log(10000) / (half_dim - 1)
116
+ emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
117
+ emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
118
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
119
+ if embedding_dim % 2 == 1:
120
+ # zero pad
121
+ emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
122
+ if padding_idx is not None:
123
+ emb[padding_idx, :] = 0
124
+ return emb
125
+
126
+ def forward(self, input, incremental_state=None, timestep=None, positions=None, **kwargs):
127
+ """Input is expected to be of size [bsz x seqlen]."""
128
+ bsz, seq_len = input.shape[:2]
129
+ max_pos = self.padding_idx + 1 + seq_len
130
+ if self.weights is None or max_pos > self.weights.size(0):
131
+ # recompute/expand embeddings if needed
132
+ self.weights = SinusoidalPositionalEmbedding.get_embedding(
133
+ max_pos,
134
+ self.embedding_dim,
135
+ self.padding_idx,
136
+ )
137
+ self.weights = self.weights.to(self._float_tensor)
138
+
139
+ if incremental_state is not None:
140
+ # positions is the same for every token when decoding a single step
141
+ pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len
142
+ return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1)
143
+
144
+ positions = utils.make_positions(input, self.padding_idx) if positions is None else positions
145
+ return self.weights.index_select(0, positions.view(-1)).view(bsz, seq_len, -1).detach()
146
+
147
+ def max_positions(self):
148
+ """Maximum number of supported positions."""
149
+ return int(1e5) # an arbitrary large number
150
+
151
+
152
+ class ConvTBC(nn.Module):
153
+ def __init__(self, in_channels, out_channels, kernel_size, padding=0):
154
+ super(ConvTBC, self).__init__()
155
+ self.in_channels = in_channels
156
+ self.out_channels = out_channels
157
+ self.kernel_size = kernel_size
158
+ self.padding = padding
159
+
160
+ self.weight = torch.nn.Parameter(torch.Tensor(
161
+ self.kernel_size, in_channels, out_channels))
162
+ self.bias = torch.nn.Parameter(torch.Tensor(out_channels))
163
+
164
+ def forward(self, input):
165
+ return torch.conv_tbc(input.contiguous(), self.weight, self.bias, self.padding)
166
+
167
+
168
+ class MultiheadAttention(nn.Module):
169
+ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias=True,
170
+ add_bias_kv=False, add_zero_attn=False, self_attention=False,
171
+ encoder_decoder_attention=False):
172
+ super().__init__()
173
+ self.embed_dim = embed_dim
174
+ self.kdim = kdim if kdim is not None else embed_dim
175
+ self.vdim = vdim if vdim is not None else embed_dim
176
+ self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
177
+
178
+ self.num_heads = num_heads
179
+ self.dropout = dropout
180
+ self.head_dim = embed_dim // num_heads
181
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
182
+ self.scaling = self.head_dim ** -0.5
183
+
184
+ self.self_attention = self_attention
185
+ self.encoder_decoder_attention = encoder_decoder_attention
186
+
187
+ assert not self.self_attention or self.qkv_same_dim, 'Self-attention requires query, key and ' \
188
+ 'value to be of the same size'
189
+
190
+ if self.qkv_same_dim:
191
+ self.in_proj_weight = Parameter(torch.Tensor(3 * embed_dim, embed_dim))
192
+ else:
193
+ self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
194
+ self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
195
+ self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
196
+
197
+ if bias:
198
+ self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim))
199
+ else:
200
+ self.register_parameter('in_proj_bias', None)
201
+
202
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
203
+
204
+ if add_bias_kv:
205
+ self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
206
+ self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
207
+ else:
208
+ self.bias_k = self.bias_v = None
209
+
210
+ self.add_zero_attn = add_zero_attn
211
+
212
+ self.reset_parameters()
213
+
214
+ self.enable_torch_version = False
215
+ if hasattr(F, "multi_head_attention_forward"):
216
+ self.enable_torch_version = True
217
+ else:
218
+ self.enable_torch_version = False
219
+ self.last_attn_probs = None
220
+
221
+ def reset_parameters(self):
222
+ if self.qkv_same_dim:
223
+ nn.init.xavier_uniform_(self.in_proj_weight)
224
+ else:
225
+ nn.init.xavier_uniform_(self.k_proj_weight)
226
+ nn.init.xavier_uniform_(self.v_proj_weight)
227
+ nn.init.xavier_uniform_(self.q_proj_weight)
228
+
229
+ nn.init.xavier_uniform_(self.out_proj.weight)
230
+ if self.in_proj_bias is not None:
231
+ nn.init.constant_(self.in_proj_bias, 0.)
232
+ nn.init.constant_(self.out_proj.bias, 0.)
233
+ if self.bias_k is not None:
234
+ nn.init.xavier_normal_(self.bias_k)
235
+ if self.bias_v is not None:
236
+ nn.init.xavier_normal_(self.bias_v)
237
+
238
+ def forward(
239
+ self,
240
+ query, key, value,
241
+ key_padding_mask=None,
242
+ incremental_state=None,
243
+ need_weights=True,
244
+ static_kv=False,
245
+ attn_mask=None,
246
+ before_softmax=False,
247
+ need_head_weights=False,
248
+ enc_dec_attn_constraint_mask=None,
249
+ reset_attn_weight=None
250
+ ):
251
+ """Input shape: Time x Batch x Channel
252
+
253
+ Args:
254
+ key_padding_mask (ByteTensor, optional): mask to exclude
255
+ keys that are pads, of shape `(batch, src_len)`, where
256
+ padding elements are indicated by 1s.
257
+ need_weights (bool, optional): return the attention weights,
258
+ averaged over heads (default: True).
259
+ attn_mask (ByteTensor, optional): typically used to
260
+ implement causal attention, where the mask prevents the
261
+ attention from looking forward in time (default: None).
262
+ before_softmax (bool, optional): return the raw attention
263
+ weights and values before the attention softmax.
264
+ need_head_weights (bool, optional): return the attention
265
+ weights for each head. Implies *need_weights*. Default:
266
+ return the average attention weights over all heads.
267
+ """
268
+ if need_head_weights:
269
+ need_weights = True
270
+
271
+ tgt_len, bsz, embed_dim = query.size()
272
+ assert embed_dim == self.embed_dim
273
+ assert list(query.size()) == [tgt_len, bsz, embed_dim]
274
+
275
+ if self.enable_torch_version and incremental_state is None and not static_kv and reset_attn_weight is None:
276
+ if self.qkv_same_dim:
277
+ return F.multi_head_attention_forward(query, key, value,
278
+ self.embed_dim, self.num_heads,
279
+ self.in_proj_weight,
280
+ self.in_proj_bias, self.bias_k, self.bias_v,
281
+ self.add_zero_attn, self.dropout,
282
+ self.out_proj.weight, self.out_proj.bias,
283
+ self.training, key_padding_mask, need_weights,
284
+ attn_mask)
285
+ else:
286
+ return F.multi_head_attention_forward(query, key, value,
287
+ self.embed_dim, self.num_heads,
288
+ torch.empty([0]),
289
+ self.in_proj_bias, self.bias_k, self.bias_v,
290
+ self.add_zero_attn, self.dropout,
291
+ self.out_proj.weight, self.out_proj.bias,
292
+ self.training, key_padding_mask, need_weights,
293
+ attn_mask, use_separate_proj_weight=True,
294
+ q_proj_weight=self.q_proj_weight,
295
+ k_proj_weight=self.k_proj_weight,
296
+ v_proj_weight=self.v_proj_weight)
297
+
298
+ if incremental_state is not None:
299
+ print('Not implemented error.')
300
+ exit()
301
+ else:
302
+ saved_state = None
303
+
304
+ if self.self_attention:
305
+ # self-attention
306
+ q, k, v = self.in_proj_qkv(query)
307
+ elif self.encoder_decoder_attention:
308
+ # encoder-decoder attention
309
+ q = self.in_proj_q(query)
310
+ if key is None:
311
+ assert value is None
312
+ k = v = None
313
+ else:
314
+ k = self.in_proj_k(key)
315
+ v = self.in_proj_v(key)
316
+
317
+ else:
318
+ q = self.in_proj_q(query)
319
+ k = self.in_proj_k(key)
320
+ v = self.in_proj_v(value)
321
+ q *= self.scaling
322
+
323
+ if self.bias_k is not None:
324
+ assert self.bias_v is not None
325
+ k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
326
+ v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
327
+ if attn_mask is not None:
328
+ attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
329
+ if key_padding_mask is not None:
330
+ key_padding_mask = torch.cat(
331
+ [key_padding_mask, key_padding_mask.new_zeros(key_padding_mask.size(0), 1)], dim=1)
332
+
333
+ q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
334
+ if k is not None:
335
+ k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
336
+ if v is not None:
337
+ v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
338
+
339
+ if saved_state is not None:
340
+ print('Not implemented error.')
341
+ exit()
342
+
343
+ src_len = k.size(1)
344
+
345
+ # This is part of a workaround to get around fork/join parallelism
346
+ # not supporting Optional types.
347
+ if key_padding_mask is not None and key_padding_mask.shape == torch.Size([]):
348
+ key_padding_mask = None
349
+
350
+ if key_padding_mask is not None:
351
+ assert key_padding_mask.size(0) == bsz
352
+ assert key_padding_mask.size(1) == src_len
353
+
354
+ if self.add_zero_attn:
355
+ src_len += 1
356
+ k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
357
+ v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
358
+ if attn_mask is not None:
359
+ attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
360
+ if key_padding_mask is not None:
361
+ key_padding_mask = torch.cat(
362
+ [key_padding_mask, torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask)], dim=1)
363
+
364
+ attn_weights = torch.bmm(q, k.transpose(1, 2))
365
+ attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
366
+
367
+ assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
368
+
369
+ if attn_mask is not None:
370
+ if len(attn_mask.shape) == 2:
371
+ attn_mask = attn_mask.unsqueeze(0)
372
+ elif len(attn_mask.shape) == 3:
373
+ attn_mask = attn_mask[:, None].repeat([1, self.num_heads, 1, 1]).reshape(
374
+ bsz * self.num_heads, tgt_len, src_len)
375
+ attn_weights = attn_weights + attn_mask
376
+
377
+ if enc_dec_attn_constraint_mask is not None: # bs x head x L_kv
378
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
379
+ attn_weights = attn_weights.masked_fill(
380
+ enc_dec_attn_constraint_mask.unsqueeze(2).bool(),
381
+ -1e9,
382
+ )
383
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
384
+
385
+ if key_padding_mask is not None:
386
+ # don't attend to padding symbols
387
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
388
+ attn_weights = attn_weights.masked_fill(
389
+ key_padding_mask.unsqueeze(1).unsqueeze(2),
390
+ -1e9,
391
+ )
392
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
393
+
394
+ attn_logits = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
395
+
396
+ if before_softmax:
397
+ return attn_weights, v
398
+
399
+ attn_weights_float = utils.softmax(attn_weights, dim=-1)
400
+ attn_weights = attn_weights_float.type_as(attn_weights)
401
+ attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training)
402
+
403
+ if reset_attn_weight is not None:
404
+ if reset_attn_weight:
405
+ self.last_attn_probs = attn_probs.detach()
406
+ else:
407
+ assert self.last_attn_probs is not None
408
+ attn_probs = self.last_attn_probs
409
+ attn = torch.bmm(attn_probs, v)
410
+ assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
411
+ attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
412
+ attn = self.out_proj(attn)
413
+
414
+ if need_weights:
415
+ attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0)
416
+ if not need_head_weights:
417
+ # average attention weights over heads
418
+ attn_weights = attn_weights.mean(dim=0)
419
+ else:
420
+ attn_weights = None
421
+
422
+ return attn, (attn_weights, attn_logits)
423
+
424
+ def in_proj_qkv(self, query):
425
+ return self._in_proj(query).chunk(3, dim=-1)
426
+
427
+ def in_proj_q(self, query):
428
+ if self.qkv_same_dim:
429
+ return self._in_proj(query, end=self.embed_dim)
430
+ else:
431
+ bias = self.in_proj_bias
432
+ if bias is not None:
433
+ bias = bias[:self.embed_dim]
434
+ return F.linear(query, self.q_proj_weight, bias)
435
+
436
+ def in_proj_k(self, key):
437
+ if self.qkv_same_dim:
438
+ return self._in_proj(key, start=self.embed_dim, end=2 * self.embed_dim)
439
+ else:
440
+ weight = self.k_proj_weight
441
+ bias = self.in_proj_bias
442
+ if bias is not None:
443
+ bias = bias[self.embed_dim:2 * self.embed_dim]
444
+ return F.linear(key, weight, bias)
445
+
446
+ def in_proj_v(self, value):
447
+ if self.qkv_same_dim:
448
+ return self._in_proj(value, start=2 * self.embed_dim)
449
+ else:
450
+ weight = self.v_proj_weight
451
+ bias = self.in_proj_bias
452
+ if bias is not None:
453
+ bias = bias[2 * self.embed_dim:]
454
+ return F.linear(value, weight, bias)
455
+
456
+ def _in_proj(self, input, start=0, end=None):
457
+ weight = self.in_proj_weight
458
+ bias = self.in_proj_bias
459
+ weight = weight[start:end, :]
460
+ if bias is not None:
461
+ bias = bias[start:end]
462
+ return F.linear(input, weight, bias)
463
+
464
+ def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz):
465
+ return attn_weights
466
+
467
+
468
+ class Swish(torch.autograd.Function):
469
+ @staticmethod
470
+ def forward(ctx, i):
471
+ result = i * torch.sigmoid(i)
472
+ ctx.save_for_backward(i)
473
+ return result
474
+
475
+ @staticmethod
476
+ def backward(ctx, grad_output):
477
+ i = ctx.saved_tensors[0]  # saved_variables is deprecated in modern PyTorch
478
+ sigmoid_i = torch.sigmoid(i)
479
+ return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
480
+
481
+
482
+ class CustomSwish(nn.Module):
483
+ def forward(self, input_tensor):
484
+ return Swish.apply(input_tensor)
485
+
486
+
487
+ class Mish(nn.Module):
488
+ def forward(self, x):
489
+ return x * torch.tanh(F.softplus(x))
490
+
491
+
492
+ class TransformerFFNLayer(nn.Module):
493
+ def __init__(self, hidden_size, filter_size, padding="SAME", kernel_size=1, dropout=0., act='gelu'):
494
+ super().__init__()
495
+ self.kernel_size = kernel_size
496
+ self.dropout = dropout
497
+ self.act = act
498
+ if padding == 'SAME':
499
+ self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2)
500
+ elif padding == 'LEFT':
501
+ self.ffn_1 = nn.Sequential(
502
+ nn.ConstantPad1d((kernel_size - 1, 0), 0.0),
503
+ nn.Conv1d(hidden_size, filter_size, kernel_size)
504
+ )
505
+ self.ffn_2 = Linear(filter_size, hidden_size)
506
+ if self.act == 'swish':
507
+ self.swish_fn = CustomSwish()
508
+
509
+ def forward(self, x, incremental_state=None):
510
+ # x: T x B x C
511
+ if incremental_state is not None:
512
+ assert incremental_state is None, 'Nar-generation does not allow this.'
513
+ exit(1)
514
+
515
+ x = self.ffn_1(x.permute(1, 2, 0)).permute(2, 0, 1)
516
+ x = x * self.kernel_size ** -0.5
517
+
518
+ if incremental_state is not None:
519
+ x = x[-1:]
520
+ if self.act == 'gelu':
521
+ x = F.gelu(x)
522
+ if self.act == 'relu':
523
+ x = F.relu(x)
524
+ if self.act == 'swish':
525
+ x = self.swish_fn(x)
526
+ x = F.dropout(x, self.dropout, training=self.training)
527
+ x = self.ffn_2(x)
528
+ return x
529
+
530
+
531
+ class BatchNorm1dTBC(nn.Module):
532
+ def __init__(self, c):
533
+ super(BatchNorm1dTBC, self).__init__()
534
+ self.bn = nn.BatchNorm1d(c)
535
+
536
+ def forward(self, x):
537
+ """
538
+
539
+ :param x: [T, B, C]
540
+ :return: [T, B, C]
541
+ """
542
+ x = x.permute(1, 2, 0) # [B, C, T]
543
+ x = self.bn(x) # [B, C, T]
544
+ x = x.permute(2, 0, 1) # [T, B, C]
545
+ return x
546
+
547
+
548
+ class EncSALayer(nn.Module):
549
+ def __init__(self, c, num_heads, dropout, attention_dropout=0.1,
550
+ relu_dropout=0.1, kernel_size=9, padding='SAME', norm='ln', act='gelu'):
551
+ super().__init__()
552
+ self.c = c
553
+ self.dropout = dropout
554
+ self.num_heads = num_heads
555
+ if num_heads > 0:
556
+ if norm == 'ln':
557
+ self.layer_norm1 = LayerNorm(c)
558
+ elif norm == 'bn':
559
+ self.layer_norm1 = BatchNorm1dTBC(c)
560
+ self.self_attn = MultiheadAttention(
561
+ self.c, num_heads, self_attention=True, dropout=attention_dropout, bias=False,
562
+ )
563
+ if norm == 'ln':
564
+ self.layer_norm2 = LayerNorm(c)
565
+ elif norm == 'bn':
566
+ self.layer_norm2 = BatchNorm1dTBC(c)
567
+ self.ffn = TransformerFFNLayer(
568
+ c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, padding=padding, act=act)
569
+
570
+ def forward(self, x, encoder_padding_mask=None, **kwargs):
571
+ layer_norm_training = kwargs.get('layer_norm_training', None)
572
+ if layer_norm_training is not None:
573
+ self.layer_norm1.training = layer_norm_training
574
+ self.layer_norm2.training = layer_norm_training
575
+ if self.num_heads > 0:
576
+ residual = x
577
+ x = self.layer_norm1(x)
578
+ x, _, = self.self_attn(
579
+ query=x,
580
+ key=x,
581
+ value=x,
582
+ key_padding_mask=encoder_padding_mask
583
+ )
584
+ x = F.dropout(x, self.dropout, training=self.training)
585
+ x = residual + x
586
+ x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None]
587
+
588
+ residual = x
589
+ x = self.layer_norm2(x)
590
+ x = self.ffn(x)
591
+ x = F.dropout(x, self.dropout, training=self.training)
592
+ x = residual + x
593
+ x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None]
594
+ return x
595
+
596
+
597
+ class DecSALayer(nn.Module):
598
+ def __init__(self, c, num_heads, dropout, attention_dropout=0.1, relu_dropout=0.1, kernel_size=9, act='gelu'):
599
+ super().__init__()
600
+ self.c = c
601
+ self.dropout = dropout
602
+ self.layer_norm1 = LayerNorm(c)
603
+ self.self_attn = MultiheadAttention(
604
+ c, num_heads, self_attention=True, dropout=attention_dropout, bias=False
605
+ )
606
+ self.layer_norm2 = LayerNorm(c)
607
+ self.encoder_attn = MultiheadAttention(
608
+ c, num_heads, encoder_decoder_attention=True, dropout=attention_dropout, bias=False,
609
+ )
610
+ self.layer_norm3 = LayerNorm(c)
611
+ self.ffn = TransformerFFNLayer(
612
+ c, 4 * c, padding='LEFT', kernel_size=kernel_size, dropout=relu_dropout, act=act)
613
+
614
+ def forward(
615
+ self,
616
+ x,
617
+ encoder_out=None,
618
+ encoder_padding_mask=None,
619
+ incremental_state=None,
620
+ self_attn_mask=None,
621
+ self_attn_padding_mask=None,
622
+ attn_out=None,
623
+ reset_attn_weight=None,
624
+ **kwargs,
625
+ ):
626
+ layer_norm_training = kwargs.get('layer_norm_training', None)
627
+ if layer_norm_training is not None:
628
+ self.layer_norm1.training = layer_norm_training
629
+ self.layer_norm2.training = layer_norm_training
630
+ self.layer_norm3.training = layer_norm_training
631
+ residual = x
632
+ x = self.layer_norm1(x)
633
+ x, _ = self.self_attn(
634
+ query=x,
635
+ key=x,
636
+ value=x,
637
+ key_padding_mask=self_attn_padding_mask,
638
+ incremental_state=incremental_state,
639
+ attn_mask=self_attn_mask
640
+ )
641
+ x = F.dropout(x, self.dropout, training=self.training)
642
+ x = residual + x
643
+
644
+ residual = x
645
+ x = self.layer_norm2(x)
646
+ if encoder_out is not None:
647
+ x, attn = self.encoder_attn(
648
+ query=x,
649
+ key=encoder_out,
650
+ value=encoder_out,
651
+ key_padding_mask=encoder_padding_mask,
652
+ incremental_state=incremental_state,
653
+ static_kv=True,
654
+ enc_dec_attn_constraint_mask=None,
655
+ # utils.get_incremental_state(self, incremental_state, 'enc_dec_attn_constraint_mask'),
656
+ reset_attn_weight=reset_attn_weight
657
+ )
658
+ attn_logits = attn[1]
659
+ else:
660
+ assert attn_out is not None
661
+ x = self.encoder_attn.in_proj_v(attn_out.transpose(0, 1))
662
+ attn_logits = None
663
+ x = F.dropout(x, self.dropout, training=self.training)
664
+ x = residual + x
665
+
666
+ residual = x
667
+ x = self.layer_norm3(x)
668
+ x = self.ffn(x, incremental_state=incremental_state)
669
+ x = F.dropout(x, self.dropout, training=self.training)
670
+ x = residual + x
671
+ # if len(attn_logits.size()) > 3:
672
+ # indices = attn_logits.softmax(-1).max(-1).values.sum(-1).argmax(-1)
673
+ # attn_logits = attn_logits.gather(1,
674
+ # indices[:, None, None, None].repeat(1, 1, attn_logits.size(-2), attn_logits.size(-1))).squeeze(1)
675
+ return x, attn_logits
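A hypothetical shape check for the encoder self-attention block defined above (not part of the commit): the layer expects `[T, B, C]` inputs and a `[B, T]` padding mask. The sizes below are illustrative, and the repository's `utils` package must be importable.

```python
# Illustrative only: run one EncSALayer on a dummy batch to confirm shapes.
import torch
from modules.commons.common_layers import EncSALayer

layer = EncSALayer(c=256, num_heads=2, dropout=0.1)      # sizes are illustrative
x = torch.randn(50, 4, 256)                              # [T, B, C]
mask = torch.zeros(4, 50, dtype=torch.bool)              # [B, T], no padding here
y = layer(x, encoder_padding_mask=mask)
print(y.shape)                                           # torch.Size([50, 4, 256])
```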
modules/commons/ssim.py ADDED
@@ -0,0 +1,84 @@
1
+ """
2
+ Adapted from https://github.com/Po-Hsun-Su/pytorch-ssim
3
+ """
4
+
5
+ from math import exp
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from torch.autograd import Variable
10
+
11
+
12
+ def gaussian(window_size, sigma):
13
+ gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)])
14
+ return gauss / gauss.sum()
15
+
16
+
17
+ def create_window(window_size, channel):
18
+ _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
19
+ _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
20
+ window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous())
21
+ return window
22
+
23
+
24
+ def _ssim(img1, img2, window, window_size, channel, size_average=True):
25
+ mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
26
+ mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
27
+
28
+ mu1_sq = mu1.pow(2)
29
+ mu2_sq = mu2.pow(2)
30
+ mu1_mu2 = mu1 * mu2
31
+
32
+ sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
33
+ sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
34
+ sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2
35
+
36
+ C1 = 0.01 ** 2
37
+ C2 = 0.03 ** 2
38
+
39
+ ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
40
+
41
+ if size_average:
42
+ return ssim_map.mean()
43
+ else:
44
+ return ssim_map.mean(1)
45
+
46
+
47
+ class SSIM(torch.nn.Module):
48
+ def __init__(self, window_size=11, size_average=True):
49
+ super(SSIM, self).__init__()
50
+ self.window_size = window_size
51
+ self.size_average = size_average
52
+ self.channel = 1
53
+ self.window = create_window(window_size, self.channel)
54
+
55
+ def forward(self, img1, img2):
56
+ (_, channel, _, _) = img1.size()
57
+
58
+ if channel == self.channel and self.window.data.type() == img1.data.type():
59
+ window = self.window
60
+ else:
61
+ window = create_window(self.window_size, channel)
62
+
63
+ if img1.is_cuda:
64
+ window = window.cuda(img1.get_device())
65
+ window = window.type_as(img1)
66
+
67
+ self.window = window
68
+ self.channel = channel
69
+
70
+ return _ssim(img1, img2, window, self.window_size, channel, self.size_average)
71
+
72
+
73
+ window = None
74
+
75
+
76
+ def ssim(img1, img2, window_size=11, size_average=True):
77
+ (_, channel, _, _) = img1.size()
78
+ global window
79
+ if window is None:
80
+ window = create_window(window_size, channel)
81
+ if img1.is_cuda:
82
+ window = window.cuda(img1.get_device())
83
+ window = window.type_as(img1)
84
+ return _ssim(img1, img2, window, window_size, channel, size_average)
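A small, hypothetical sanity check for the SSIM helpers above: identical single-channel inputs (e.g. mel-spectrograms viewed as `[B, 1, T, M]`) should score close to 1, while unrelated noise scores noticeably lower.

```python
# Illustrative check of the ssim() helper defined above.
import torch
from modules.commons.ssim import ssim

a = torch.rand(2, 1, 128, 80)               # [B, 1, T, M], values in [0, 1]
print(float(ssim(a, a.clone())))            # ~1.0 for identical inputs
print(float(ssim(a, torch.rand_like(a))))   # clearly lower for unrelated noise
```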
modules/diff/__pycache__/diffusion.cpython-310.pyc ADDED
Binary file (11 kB). View file
 
modules/diff/__pycache__/diffusion.cpython-38.pyc ADDED
Binary file (11 kB). View file
 
modules/diff/__pycache__/net.cpython-310.pyc ADDED
Binary file (4.57 kB). View file
 
modules/diff/__pycache__/net.cpython-38.pyc ADDED
Binary file (4.61 kB). View file
 
modules/diff/diffusion.py ADDED
@@ -0,0 +1,312 @@
1
+ from collections import deque
2
+ from functools import partial
3
+ from inspect import isfunction
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from torch import nn
9
+ from tqdm import tqdm
10
+
11
+ from modules.encoder import SvcEncoder
12
+ from training.train_pipeline import Batch2Loss
13
+ from utils.hparams import hparams
14
+
15
+
16
+ def exists(x):
17
+ return x is not None
18
+
19
+
20
+ def default(val, d):
21
+ if exists(val):
22
+ return val
23
+ return d() if isfunction(d) else d
24
+
25
+
26
+ # gaussian diffusion trainer class
27
+
28
+ def extract(a, t, x_shape):
29
+ b, *_ = t.shape
30
+ out = a.gather(-1, t)
31
+ return out.reshape(b, *((1,) * (len(x_shape) - 1)))
32
+
33
+
34
+ def noise_like(shape, device, repeat=False):
35
+ repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
36
+ noise = lambda: torch.randn(shape, device=device)
37
+ return repeat_noise() if repeat else noise()
38
+
39
+
40
+ def linear_beta_schedule(timesteps, max_beta=hparams.get('max_beta', 0.01)):
41
+ """
42
+ linear schedule
43
+ """
44
+ betas = np.linspace(1e-4, max_beta, timesteps)
45
+ return betas
46
+
47
+
48
+ def cosine_beta_schedule(timesteps, s=0.008):
49
+ """
50
+ cosine schedule
51
+ as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
52
+ """
53
+ steps = timesteps + 1
54
+ x = np.linspace(0, steps, steps)
55
+ alphas_cumprod = np.cos(((x / steps) + s) / (1 + s) * np.pi * 0.5) ** 2
56
+ alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
57
+ betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
58
+ return np.clip(betas, a_min=0, a_max=0.999)
59
+
60
+
61
+ beta_schedule = {
62
+ "cosine": cosine_beta_schedule,
63
+ "linear": linear_beta_schedule,
64
+ }
65
+
66
+
67
+ class GaussianDiffusion(nn.Module):
68
+ def __init__(self, phone_encoder, out_dims, denoise_fn,
69
+ timesteps=1000, K_step=1000, loss_type=hparams.get('diff_loss_type', 'l1'), betas=None, spec_min=None,
70
+ spec_max=None):
71
+ super().__init__()
72
+ self.denoise_fn = denoise_fn
73
+ self.fs2 = SvcEncoder(phone_encoder, out_dims)
74
+ self.mel_bins = out_dims
75
+
76
+ if exists(betas):
77
+ betas = betas.detach().cpu().numpy() if isinstance(betas, torch.Tensor) else betas
78
+ else:
79
+ if 'schedule_type' in hparams.keys():
80
+ betas = beta_schedule[hparams['schedule_type']](timesteps)
81
+ else:
82
+ betas = cosine_beta_schedule(timesteps)
83
+
84
+ alphas = 1. - betas
85
+ alphas_cumprod = np.cumprod(alphas, axis=0)
86
+ alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
87
+
88
+ timesteps, = betas.shape
89
+ self.num_timesteps = int(timesteps)
90
+ self.K_step = K_step
91
+ self.loss_type = loss_type
92
+
93
+ self.noise_list = deque(maxlen=4)
94
+
95
+ to_torch = partial(torch.tensor, dtype=torch.float32)
96
+
97
+ self.register_buffer('betas', to_torch(betas))
98
+ self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
99
+ self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))
100
+
101
+ # calculations for diffusion q(x_t | x_{t-1}) and others
102
+ self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
103
+ self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
104
+ self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
105
+ self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
106
+ self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))
107
+
108
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
109
+ posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod)
110
+ # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
111
+ self.register_buffer('posterior_variance', to_torch(posterior_variance))
112
+ # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
113
+ self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
114
+ self.register_buffer('posterior_mean_coef1', to_torch(
115
+ betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
116
+ self.register_buffer('posterior_mean_coef2', to_torch(
117
+ (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
118
+
119
+ self.register_buffer('spec_min', torch.FloatTensor(spec_min)[None, None, :hparams['keep_bins']])
120
+ self.register_buffer('spec_max', torch.FloatTensor(spec_max)[None, None, :hparams['keep_bins']])
121
+
122
+ def q_mean_variance(self, x_start, t):
123
+ mean = extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
124
+ variance = extract(1. - self.alphas_cumprod, t, x_start.shape)
125
+ log_variance = extract(self.log_one_minus_alphas_cumprod, t, x_start.shape)
126
+ return mean, variance, log_variance
127
+
128
+ def predict_start_from_noise(self, x_t, t, noise):
129
+ return (
130
+ extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
131
+ extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
132
+ )
133
+
134
+ def q_posterior(self, x_start, x_t, t):
135
+ posterior_mean = (
136
+ extract(self.posterior_mean_coef1, t, x_t.shape) * x_start +
137
+ extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
138
+ )
139
+ posterior_variance = extract(self.posterior_variance, t, x_t.shape)
140
+ posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, x_t.shape)
141
+ return posterior_mean, posterior_variance, posterior_log_variance_clipped
142
+
143
+ def p_mean_variance(self, x, t, cond, clip_denoised: bool):
144
+ noise_pred = self.denoise_fn(x, t, cond=cond)
145
+ x_recon = self.predict_start_from_noise(x, t=t, noise=noise_pred)
146
+
147
+ if clip_denoised:
148
+ x_recon.clamp_(-1., 1.)
149
+
150
+ model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
151
+ return model_mean, posterior_variance, posterior_log_variance
152
+
153
+ @torch.no_grad()
154
+ def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False):
155
+ b, *_, device = *x.shape, x.device
156
+ model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, cond=cond, clip_denoised=clip_denoised)
157
+ noise = noise_like(x.shape, device, repeat_noise)
158
+ # no noise when t == 0
159
+ nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
160
+ return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
161
+
162
+ @torch.no_grad()
163
+ def p_sample_plms(self, x, t, interval, cond, clip_denoised=True, repeat_noise=False):
164
+ """
165
+ Use the PLMS method from [Pseudo Numerical Methods for Diffusion Models on Manifolds](https://arxiv.org/abs/2202.09778).
166
+ """
167
+
168
+ def get_x_pred(x, noise_t, t):
169
+ a_t = extract(self.alphas_cumprod, t, x.shape)
170
+ a_prev = extract(self.alphas_cumprod, torch.max(t - interval, torch.zeros_like(t)), x.shape)
171
+ a_t_sq, a_prev_sq = a_t.sqrt(), a_prev.sqrt()
172
+
173
+ x_delta = (a_prev - a_t) * ((1 / (a_t_sq * (a_t_sq + a_prev_sq))) * x - 1 / (
174
+ a_t_sq * (((1 - a_prev) * a_t).sqrt() + ((1 - a_t) * a_prev).sqrt())) * noise_t)
175
+ x_pred = x + x_delta
176
+
177
+ return x_pred
178
+
179
+ noise_list = self.noise_list
180
+ noise_pred = self.denoise_fn(x, t, cond=cond)
181
+
182
+ if len(noise_list) == 0:
183
+ x_pred = get_x_pred(x, noise_pred, t)
184
+ noise_pred_prev = self.denoise_fn(x_pred, max(t - interval, 0), cond=cond)
185
+ noise_pred_prime = (noise_pred + noise_pred_prev) / 2
186
+ elif len(noise_list) == 1:
187
+ noise_pred_prime = (3 * noise_pred - noise_list[-1]) / 2
188
+ elif len(noise_list) == 2:
189
+ noise_pred_prime = (23 * noise_pred - 16 * noise_list[-1] + 5 * noise_list[-2]) / 12
190
+ elif len(noise_list) >= 3:
191
+ noise_pred_prime = (55 * noise_pred - 59 * noise_list[-1] + 37 * noise_list[-2] - 9 * noise_list[-3]) / 24
192
+
193
+ x_prev = get_x_pred(x, noise_pred_prime, t)
194
+ noise_list.append(noise_pred)
195
+
196
+ return x_prev
197
+
198
+ def q_sample(self, x_start, t, noise=None):
199
+ noise = default(noise, lambda: torch.randn_like(x_start))
200
+ return (
201
+ extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
202
+ extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
203
+ )
204
+
205
+ def p_losses(self, x_start, t, cond, noise=None, nonpadding=None):
206
+ noise = default(noise, lambda: torch.randn_like(x_start))
207
+
208
+ x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
209
+ x_recon = self.denoise_fn(x_noisy, t, cond)
210
+
211
+ if self.loss_type == 'l1':
212
+ if nonpadding is not None:
213
+ loss = ((noise - x_recon).abs() * nonpadding.unsqueeze(1)).mean()
214
+ else:
215
+ # print('are you sure w/o nonpadding?')
216
+ loss = (noise - x_recon).abs().mean()
217
+
218
+ elif self.loss_type == 'l2':
219
+ loss = F.mse_loss(noise, x_recon)
220
+ else:
221
+ raise NotImplementedError()
222
+
223
+ return loss
224
+
225
+ def forward(self, hubert, mel2ph=None, spk_embed=None,
226
+ ref_mels=None, f0=None, uv=None, energy=None, infer=False, **kwargs):
227
+ '''
228
+ conditioning diffusion, use fastspeech2 encoder output as the condition
229
+ '''
230
+ ret = self.fs2(hubert, mel2ph, spk_embed, None, f0, uv, energy,
231
+ skip_decoder=True, infer=infer, **kwargs)
232
+ cond = ret['decoder_inp'].transpose(1, 2)
233
+ b, *_, device = *hubert.shape, hubert.device
234
+
235
+ if not infer:
236
+ Batch2Loss.module4(
237
+ self.p_losses,
238
+ self.norm_spec(ref_mels), cond, ret, self.K_step, b, device
239
+ )
240
+ else:
241
+ if 'use_gt_mel' in kwargs.keys() and kwargs['use_gt_mel']:
242
+ t = kwargs['add_noise_step']
243
+ print('===> using ground-truth mel as the start; please make sure parameter "key==0"!')
244
+ fs2_mels = ref_mels
245
+ fs2_mels = self.norm_spec(fs2_mels)
246
+ fs2_mels = fs2_mels.transpose(1, 2)[:, None, :, :]
247
+ x = self.q_sample(x_start=fs2_mels, t=torch.tensor([t - 1], device=device).long())
248
+ else:
249
+ t = self.K_step
250
+ shape = (cond.shape[0], 1, self.mel_bins, cond.shape[2])
251
+ x = torch.randn(shape, device=device)
252
+ if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1:
253
+ self.noise_list = deque(maxlen=4)
254
+ iteration_interval = hparams['pndm_speedup']
255
+ for i in tqdm(reversed(range(0, t, iteration_interval)), desc='sample time step',
256
+ total=t // iteration_interval):
257
+ x = self.p_sample_plms(x, torch.full((b,), i, device=device, dtype=torch.long), iteration_interval,
258
+ cond)
259
+ else:
260
+ for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
261
+ x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
262
+ x = x[:, 0].transpose(1, 2)
263
+ if mel2ph is not None: # for singing
264
+ ret['mel_out'] = self.denorm_spec(x) * ((mel2ph > 0).float()[:, :, None])
265
+ else:
266
+ ret['mel_out'] = self.denorm_spec(x)
267
+ return ret
268
+
269
+ def norm_spec(self, x):
270
+ return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
271
+
272
+ def denorm_spec(self, x):
273
+ return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min
274
+
275
+ def out2mel(self, x):
276
+ return x
277
+
278
+
279
+ class OfflineGaussianDiffusion(GaussianDiffusion):
280
+ def forward(self, txt_tokens, mel2ph=None, spk_embed=None,
281
+ ref_mels=None, f0=None, uv=None, energy=None, infer=False, **kwargs):
282
+ b, *_, device = *txt_tokens.shape, txt_tokens.device
283
+
284
+ ret = self.fs2(txt_tokens, mel2ph, spk_embed, ref_mels, f0, uv, energy,
285
+ skip_decoder=True, infer=True, **kwargs)
286
+ cond = ret['decoder_inp'].transpose(1, 2)
287
+ fs2_mels = ref_mels[1]
288
+ ref_mels = ref_mels[0]
289
+
290
+ if not infer:
291
+ t = torch.randint(0, self.K_step, (b,), device=device).long()
292
+ x = ref_mels
293
+ x = self.norm_spec(x)
294
+ x = x.transpose(1, 2)[:, None, :, :] # [B, 1, M, T]
295
+ ret['diff_loss'] = self.p_losses(x, t, cond)
296
+ else:
297
+ t = self.K_step
298
+ fs2_mels = self.norm_spec(fs2_mels)
299
+ fs2_mels = fs2_mels.transpose(1, 2)[:, None, :, :]
300
+
301
+ x = self.q_sample(x_start=fs2_mels, t=torch.tensor([t - 1], device=device).long())
302
+
303
+ if hparams.get('gaussian_start') is not None and hparams['gaussian_start']:
304
+ print('===> gaussian start.')
305
+ shape = (cond.shape[0], 1, self.mel_bins, cond.shape[2])
306
+ x = torch.randn(shape, device=device)
307
+ for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
308
+ x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
309
+ x = x[:, 0].transpose(1, 2)
310
+ ret['mel_out'] = self.denorm_spec(x)
311
+
312
+ return ret
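To make the schedule above concrete, here is a self-contained NumPy sketch (mirroring `cosine_beta_schedule`) that prints how `alphas_cumprod` decays over 1000 steps; the printed values are only indicative.

```python
# Standalone sketch of the cosine beta schedule used by GaussianDiffusion.
import numpy as np

def cosine_beta_schedule(timesteps, s=0.008):
    steps = timesteps + 1
    x = np.linspace(0, steps, steps)
    alphas_cumprod = np.cos(((x / steps) + s) / (1 + s) * np.pi * 0.5) ** 2
    alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
    betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
    return np.clip(betas, 0, 0.999)

alphas_cumprod = np.cumprod(1.0 - cosine_beta_schedule(1000))
for t in (0, 250, 500, 750, 999):
    # q_sample draws x_t = sqrt(a_t) * x_0 + sqrt(1 - a_t) * noise
    print(t, round(float(alphas_cumprod[t]), 4))
```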
modules/diff/net.py ADDED
@@ -0,0 +1,135 @@
1
+ import math
2
+ from math import sqrt
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from modules.commons.common_layers import Mish
9
+ from utils.hparams import hparams
10
+
11
+ Linear = nn.Linear
12
+ ConvTranspose2d = nn.ConvTranspose2d
13
+
14
+
15
+ class AttrDict(dict):
16
+ def __init__(self, *args, **kwargs):
17
+ super(AttrDict, self).__init__(*args, **kwargs)
18
+ self.__dict__ = self
19
+
20
+ def override(self, attrs):
21
+ if isinstance(attrs, dict):
22
+ self.__dict__.update(**attrs)
23
+ elif isinstance(attrs, (list, tuple, set)):
24
+ for attr in attrs:
25
+ self.override(attr)
26
+ elif attrs is not None:
27
+ raise NotImplementedError
28
+ return self
29
+
30
+
31
+ class SinusoidalPosEmb(nn.Module):
32
+ def __init__(self, dim):
33
+ super().__init__()
34
+ self.dim = dim
35
+
36
+ def forward(self, x):
37
+ device = x.device
38
+ half_dim = self.dim // 2
39
+ emb = math.log(10000) / (half_dim - 1)
40
+ emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
41
+ emb = x[:, None] * emb[None, :]
42
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
43
+ return emb
44
+
45
+
46
+ def Conv1d(*args, **kwargs):
47
+ layer = nn.Conv1d(*args, **kwargs)
48
+ nn.init.kaiming_normal_(layer.weight)
49
+ return layer
50
+
51
+
52
+ @torch.jit.script
53
+ def silu(x):
54
+ return x * torch.sigmoid(x)
55
+
56
+
57
+ class ResidualBlock(nn.Module):
58
+ def __init__(self, encoder_hidden, residual_channels, dilation):
59
+ super().__init__()
60
+ self.dilated_conv = Conv1d(residual_channels, 2 * residual_channels, 3, padding=dilation, dilation=dilation)
61
+ self.diffusion_projection = Linear(residual_channels, residual_channels)
62
+ self.conditioner_projection = Conv1d(encoder_hidden, 2 * residual_channels, 1)
63
+ self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1)
64
+
65
+ def forward(self, x, conditioner, diffusion_step):
66
+ diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
67
+ conditioner = self.conditioner_projection(conditioner)
68
+ y = x + diffusion_step
69
+
70
+ y = self.dilated_conv(y) + conditioner
71
+
72
+ gate, filter = torch.chunk(y, 2, dim=1)
73
+ # Using torch.split instead of torch.chunk to avoid using onnx::Slice
74
+ # gate, filter = torch.split(y, torch.div(y.shape[1], 2), dim=1)
75
+
76
+ y = torch.sigmoid(gate) * torch.tanh(filter)
77
+
78
+ y = self.output_projection(y)
79
+ residual, skip = torch.chunk(y, 2, dim=1)
80
+ # Using torch.split instead of torch.chunk to avoid using onnx::Slice
81
+ # residual, skip = torch.split(y, torch.div(y.shape[1], 2), dim=1)
82
+
83
+ return (x + residual) / sqrt(2.0), skip
84
+
85
+
86
+ class DiffNet(nn.Module):
87
+ def __init__(self, in_dims=80):
88
+ super().__init__()
89
+ self.params = params = AttrDict(
90
+ # Model params
91
+ encoder_hidden=hparams['hidden_size'],
92
+ residual_layers=hparams['residual_layers'],
93
+ residual_channels=hparams['residual_channels'],
94
+ dilation_cycle_length=hparams['dilation_cycle_length'],
95
+ )
96
+ self.input_projection = Conv1d(in_dims, params.residual_channels, 1)
97
+ self.diffusion_embedding = SinusoidalPosEmb(params.residual_channels)
98
+ dim = params.residual_channels
99
+ self.mlp = nn.Sequential(
100
+ nn.Linear(dim, dim * 4),
101
+ Mish(),
102
+ nn.Linear(dim * 4, dim)
103
+ )
104
+ self.residual_layers = nn.ModuleList([
105
+ ResidualBlock(params.encoder_hidden, params.residual_channels, 2 ** (i % params.dilation_cycle_length))
106
+ for i in range(params.residual_layers)
107
+ ])
108
+ self.skip_projection = Conv1d(params.residual_channels, params.residual_channels, 1)
109
+ self.output_projection = Conv1d(params.residual_channels, in_dims, 1)
110
+ nn.init.zeros_(self.output_projection.weight)
111
+
112
+ def forward(self, spec, diffusion_step, cond):
113
+ """
114
+
115
+ :param spec: [B, 1, M, T]
116
+ :param diffusion_step: [B, 1]
117
+ :param cond: [B, M, T]
118
+ :return:
119
+ """
120
+ x = spec[:, 0]
121
+ x = self.input_projection(x) # x [B, residual_channel, T]
122
+
123
+ x = F.relu(x)
124
+ diffusion_step = self.diffusion_embedding(diffusion_step)
125
+ diffusion_step = self.mlp(diffusion_step)
126
+ skip = []
127
+ for layer_id, layer in enumerate(self.residual_layers):
128
+ x, skip_connection = layer(x, cond, diffusion_step)
129
+ skip.append(skip_connection)
130
+
131
+ x = torch.sum(torch.stack(skip), dim=0) / sqrt(len(self.residual_layers))
132
+ x = self.skip_projection(x)
133
+ x = F.relu(x)
134
+ x = self.output_projection(x) # [B, 80, T]
135
+ return x[:, None, :, :]
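For reference, a standalone sketch of the diffusion-step embedding used by `DiffNet` (the 256-dim size is an assumption standing in for `residual_channels`): a batch of integer timesteps becomes a `[B, dim]` sinusoidal vector that is then fed through the MLP.

```python
# Standalone sketch of SinusoidalPosEmb above; dim=256 is illustrative.
import math
import torch

def sinusoidal_pos_emb(t: torch.Tensor, dim: int = 256) -> torch.Tensor:
    half_dim = dim // 2
    scale = math.log(10000) / (half_dim - 1)
    freqs = torch.exp(torch.arange(half_dim, device=t.device) * -scale)
    args = t[:, None].float() * freqs[None, :]
    return torch.cat((args.sin(), args.cos()), dim=-1)   # [B, dim]

emb = sinusoidal_pos_emb(torch.tensor([0, 100, 999]))
print(emb.shape)   # torch.Size([3, 256])
```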
modules/encoder.py ADDED
@@ -0,0 +1,208 @@
1
+ import torch
2
+
3
+ from modules.commons.common_layers import *
4
+ from modules.commons.common_layers import Embedding
5
+ from modules.commons.common_layers import SinusoidalPositionalEmbedding
6
+ from utils.hparams import hparams
7
+ from utils.pitch_utils import f0_to_coarse, denorm_f0
8
+
9
+
10
+ class LayerNorm(torch.nn.LayerNorm):
11
+ """Layer normalization module.
12
+ :param int nout: output dim size
13
+ :param int dim: dimension to be normalized
14
+ """
15
+
16
+ def __init__(self, nout, dim=-1):
17
+ """Construct a LayerNorm object."""
18
+ super(LayerNorm, self).__init__(nout, eps=1e-12)
19
+ self.dim = dim
20
+
21
+ def forward(self, x):
22
+ """Apply layer normalization.
23
+ :param torch.Tensor x: input tensor
24
+ :return: layer normalized tensor
25
+ :rtype torch.Tensor
26
+ """
27
+ if self.dim == -1:
28
+ return super(LayerNorm, self).forward(x)
29
+ return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1)
30
+
31
+
32
+ class PitchPredictor(torch.nn.Module):
33
+ def __init__(self, idim, n_layers=5, n_chans=384, odim=2, kernel_size=5,
34
+ dropout_rate=0.1, padding='SAME'):
35
+ """Initialize pitch predictor module.
36
+ Args:
37
+ idim (int): Input dimension.
38
+ n_layers (int, optional): Number of convolutional layers.
39
+ n_chans (int, optional): Number of channels of convolutional layers.
40
+ kernel_size (int, optional): Kernel size of convolutional layers.
41
+ dropout_rate (float, optional): Dropout rate.
42
+ """
43
+ super(PitchPredictor, self).__init__()
44
+ self.conv = torch.nn.ModuleList()
45
+ self.kernel_size = kernel_size
46
+ self.padding = padding
47
+ for idx in range(n_layers):
48
+ in_chans = idim if idx == 0 else n_chans
49
+ self.conv += [torch.nn.Sequential(
50
+ torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2)
51
+ if padding == 'SAME'
52
+ else (kernel_size - 1, 0), 0),
53
+ torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0),
54
+ torch.nn.ReLU(),
55
+ LayerNorm(n_chans, dim=1),
56
+ torch.nn.Dropout(dropout_rate)
57
+ )]
58
+ self.linear = torch.nn.Linear(n_chans, odim)
59
+ self.embed_positions = SinusoidalPositionalEmbedding(idim, 0, init_size=4096)
60
+ self.pos_embed_alpha = nn.Parameter(torch.Tensor([1]))
61
+
62
+ def forward(self, xs):
63
+ """
64
+
65
+ :param xs: [B, T, H]
66
+ :return: [B, T, H]
67
+ """
68
+ positions = self.pos_embed_alpha * self.embed_positions(xs[..., 0])
69
+ xs = xs + positions
70
+ xs = xs.transpose(1, -1) # (B, idim, Tmax)
71
+ for f in self.conv:
72
+ xs = f(xs) # (B, C, Tmax)
73
+ # NOTE: calculate in log domain
74
+ xs = self.linear(xs.transpose(1, -1)) # (B, Tmax, H)
75
+ return xs
76
+
77
+
78
+ class SvcEncoder(nn.Module):
79
+ def __init__(self, dictionary, out_dims=None):
80
+ super().__init__()
81
+ # self.dictionary = dictionary
82
+ self.padding_idx = 0
83
+ self.hidden_size = hparams['hidden_size']
84
+ self.out_dims = out_dims
85
+ if out_dims is None:
86
+ self.out_dims = hparams['audio_num_mel_bins']
87
+ self.mel_out = Linear(self.hidden_size, self.out_dims, bias=True)
88
+ predictor_hidden = hparams['predictor_hidden'] if hparams['predictor_hidden'] > 0 else self.hidden_size
89
+ if hparams['use_pitch_embed']:
90
+ self.pitch_embed = Embedding(300, self.hidden_size, self.padding_idx)
91
+ self.pitch_predictor = PitchPredictor(
92
+ self.hidden_size,
93
+ n_chans=predictor_hidden,
94
+ n_layers=hparams['predictor_layers'],
95
+ dropout_rate=hparams['predictor_dropout'],
96
+ odim=2 if hparams['pitch_type'] == 'frame' else 1,
97
+ padding=hparams['ffn_padding'], kernel_size=hparams['predictor_kernel'])
98
+ if hparams['use_energy_embed']:
99
+ self.energy_embed = Embedding(256, self.hidden_size, self.padding_idx)
100
+ if hparams['use_spk_id']:
101
+ self.spk_embed_proj = Embedding(hparams['num_spk'], self.hidden_size)
102
+ if hparams['use_split_spk_id']:
103
+ self.spk_embed_f0 = Embedding(hparams['num_spk'], self.hidden_size)
104
+ self.spk_embed_dur = Embedding(hparams['num_spk'], self.hidden_size)
105
+ elif hparams['use_spk_embed']:
106
+ self.spk_embed_proj = Linear(256, self.hidden_size, bias=True)
107
+
108
+ def forward(self, hubert, mel2ph=None, spk_embed=None,
109
+ ref_mels=None, f0=None, uv=None, energy=None, skip_decoder=True,
110
+ spk_embed_dur_id=None, spk_embed_f0_id=None, infer=False, **kwargs):
111
+ ret = {}
112
+ encoder_out = hubert
113
+ src_nonpadding = (hubert != 0).any(-1)[:, :, None]
114
+
115
+ # add ref style embed
116
+ # Not implemented
117
+ # variance encoder
118
+ var_embed = 0
119
+
120
+ # encoder_out_dur denotes encoder outputs for duration predictor
121
+ # in speech adaptation, duration predictor use old speaker embedding
122
+ if hparams['use_spk_embed']:
123
+ spk_embed_dur = spk_embed_f0 = spk_embed = self.spk_embed_proj(spk_embed)[:, None, :]
124
+ elif hparams['use_spk_id']:
125
+ spk_embed_id = spk_embed
126
+ if spk_embed_dur_id is None:
127
+ spk_embed_dur_id = spk_embed_id
128
+ if spk_embed_f0_id is None:
129
+ spk_embed_f0_id = spk_embed_id
130
+ spk_embed_0 = self.spk_embed_proj(spk_embed_id.to(hubert.device))[:, None, :]
131
+ spk_embed_1 = self.spk_embed_proj(torch.LongTensor([0]).to(hubert.device))[:, None, :]
132
+ spk_embed_2 = self.spk_embed_proj(torch.LongTensor([0]).to(hubert.device))[:, None, :]
133
+ spk_embed = 1 * spk_embed_0 + 0 * spk_embed_1 + 0 * spk_embed_2
134
+ spk_embed_dur = spk_embed_f0 = spk_embed
135
+ if hparams['use_split_spk_id']:
136
+ spk_embed_dur = self.spk_embed_dur(spk_embed_dur_id)[:, None, :]
137
+ spk_embed_f0 = self.spk_embed_f0(spk_embed_f0_id)[:, None, :]
138
+ else:
139
+ spk_embed_dur = spk_embed_f0 = spk_embed = 0
140
+
141
+ ret['mel2ph'] = mel2ph
142
+
143
+ decoder_inp = F.pad(encoder_out, [0, 0, 1, 0])
144
+
145
+ mel2ph_ = mel2ph[..., None].repeat([1, 1, encoder_out.shape[-1]])
146
+ decoder_inp_origin = decoder_inp = torch.gather(decoder_inp, 1, mel2ph_) # [B, T, H]
147
+
148
+ tgt_nonpadding = (mel2ph > 0).float()[:, :, None]
149
+
150
+ # add pitch and energy embed
151
+ pitch_inp = (decoder_inp_origin + var_embed + spk_embed_f0) * tgt_nonpadding
152
+ if hparams['use_pitch_embed']:
153
+ pitch_inp_ph = (encoder_out + var_embed + spk_embed_f0) * src_nonpadding
154
+ decoder_inp = decoder_inp + self.add_pitch(pitch_inp, f0, uv, mel2ph, ret, encoder_out=pitch_inp_ph)
155
+ if hparams['use_energy_embed']:
156
+ decoder_inp = decoder_inp + self.add_energy(pitch_inp, energy, ret)
157
+
158
+ ret['decoder_inp'] = decoder_inp = (decoder_inp + spk_embed) * tgt_nonpadding
159
+ return ret
160
+
161
+ def add_dur(self, dur_input, mel2ph, hubert, ret):
162
+ src_padding = (hubert == 0).all(-1)
163
+ dur_input = dur_input.detach() + hparams['predictor_grad'] * (dur_input - dur_input.detach())
164
+ if mel2ph is None:
165
+ dur, xs = self.dur_predictor.inference(dur_input, src_padding)
166
+ ret['dur'] = xs
167
+ ret['dur_choice'] = dur
168
+ mel2ph = self.length_regulator(dur, src_padding).detach()
169
+ else:
170
+ ret['dur'] = self.dur_predictor(dur_input, src_padding)
171
+ ret['mel2ph'] = mel2ph
172
+ return mel2ph
173
+
174
+ def run_decoder(self, decoder_inp, tgt_nonpadding, ret, infer, **kwargs):
175
+ x = decoder_inp # [B, T, H]
176
+ x = self.mel_out(x)
177
+ return x * tgt_nonpadding
178
+
179
+ def out2mel(self, out):
180
+ return out
181
+
182
+ def add_pitch(self, decoder_inp, f0, uv, mel2ph, ret, encoder_out=None):
183
+ decoder_inp = decoder_inp.detach() + hparams['predictor_grad'] * (decoder_inp - decoder_inp.detach())
184
+
185
+ pitch_padding = (mel2ph == 0)
186
+ ret['f0_denorm'] = f0_denorm = denorm_f0(f0, uv, hparams, pitch_padding=pitch_padding)
187
+ if pitch_padding is not None:
188
+ f0[pitch_padding] = 0
189
+
190
+ pitch = f0_to_coarse(f0_denorm, hparams) # start from 0
191
+ ret['pitch_pred'] = pitch.unsqueeze(-1)
192
+ pitch_embedding = self.pitch_embed(pitch)
193
+ return pitch_embedding
194
+
195
+ def add_energy(self, decoder_inp, energy, ret):
196
+ decoder_inp = decoder_inp.detach() + hparams['predictor_grad'] * (decoder_inp - decoder_inp.detach())
197
+ ret['energy_pred'] = energy # energy_pred = self.energy_predictor(decoder_inp)[:, :, 0]
198
+ energy = torch.clamp(energy * 256 // 4, max=255).long() # energy_to_coarse
199
+ energy_embedding = self.energy_embed(energy)
200
+ return energy_embedding
201
+
202
+ @staticmethod
203
+ def mel_norm(x):
204
+ return (x + 5.5) / (6.3 / 2) - 1
205
+
206
+ @staticmethod
207
+ def mel_denorm(x):
208
+ return (x + 1) * (6.3 / 2) - 5.5
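A quick standalone check that `mel_norm` and `mel_denorm` above are inverses, mapping the roughly [-5.5, 0.8] log-mel range implied by the constants to [-1, 1] and back (the test values are illustrative).

```python
# Standalone round-trip check of the mel_norm / mel_denorm constants above.
import torch

def mel_norm(x):
    return (x + 5.5) / (6.3 / 2) - 1

def mel_denorm(x):
    return (x + 1) * (6.3 / 2) - 5.5

x = torch.linspace(-5.5, 0.8, 5)
print(mel_norm(x))                                                   # spans [-1.0, 1.0]
print(bool(torch.allclose(mel_denorm(mel_norm(x)), x, atol=1e-6)))   # True
```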
modules/hubert/__pycache__/cn_hubert.cpython-38.pyc ADDED
Binary file (1.32 kB). View file
 
modules/hubert/__pycache__/hubert_model.cpython-38.pyc ADDED
Binary file (8.38 kB). View file
 
modules/hubert/__pycache__/hubert_onnx.cpython-38.pyc ADDED
Binary file (735 Bytes). View file
 
modules/hubert/cn_hubert.py ADDED
@@ -0,0 +1,40 @@
1
+ import librosa
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+
6
+ def load_cn_model(ch_hubert_path):
7
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
8
+ from fairseq import checkpoint_utils
9
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
10
+ [ch_hubert_path],
11
+ suffix="",
12
+ )
13
+ model = models[0]
14
+ model = model.to(device)
15
+ model.eval()
16
+ return model
17
+
18
+
19
+ def get_cn_hubert_units(con_model, audio_path, dev):
20
+ audio, sampling_rate = librosa.load(audio_path)
21
+ if len(audio.shape) > 1:
22
+ audio = librosa.to_mono(audio.transpose(1, 0))
23
+ if sampling_rate != 16000:
24
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
25
+
26
+ feats = torch.from_numpy(audio).float()
27
+ if feats.dim() == 2: # double channels
28
+ feats = feats.mean(-1)
29
+ assert feats.dim() == 1, feats.dim()
30
+ feats = feats.view(1, -1)
31
+ padding_mask = torch.BoolTensor(feats.shape).fill_(False)
32
+ inputs = {
33
+ "source": feats.to(dev),
34
+ "padding_mask": padding_mask.to(dev),
35
+ "output_layer": 9, # layer 9
36
+ }
37
+ with torch.no_grad():
38
+ logits = con_model.extract_features(**inputs)
39
+ feats = con_model.final_proj(logits[0])
40
+ return feats
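A minimal usage sketch for the two helpers above; the checkpoint and audio paths are placeholders, and loading requires fairseq to be installed:

import torch
from modules.hubert.cn_hubert import load_cn_model, get_cn_hubert_units

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load_cn_model("path/to/chinese-hubert-base.pt")        # hypothetical fairseq checkpoint path
units = get_cn_hubert_units(model, "path/to/input.wav", device)
print(units.shape)                                             # (1, T, D) layer-9 content features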
modules/hubert/hubert_model.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import random
3
+ from typing import Optional, Tuple
4
+
5
+ import librosa
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as t_func
9
+ from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
10
+
11
+
12
+ class Hubert(nn.Module):
13
+ def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
14
+ super().__init__()
15
+ self._mask = mask
16
+ self.feature_extractor = FeatureExtractor()
17
+ self.feature_projection = FeatureProjection()
18
+ self.positional_embedding = PositionalConvEmbedding()
19
+ self.norm = nn.LayerNorm(768)
20
+ self.dropout = nn.Dropout(0.1)
21
+ self.encoder = TransformerEncoder(
22
+ nn.TransformerEncoderLayer(
23
+ 768, 12, 3072, activation="gelu", batch_first=True
24
+ ),
25
+ 12,
26
+ )
27
+ self.proj = nn.Linear(768, 256)
28
+
29
+ self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
30
+ self.label_embedding = nn.Embedding(num_label_embeddings, 256)
31
+
32
+ def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
33
+ mask = None
34
+ if self.training and self._mask:
35
+ mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
36
+ x[mask] = self.masked_spec_embed.to(x.dtype)
37
+ return x, mask
38
+
39
+ def encode(
40
+ self, x: torch.Tensor, layer: Optional[int] = None
41
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
42
+ x = self.feature_extractor(x)
43
+ x = self.feature_projection(x.transpose(1, 2))
44
+ x, mask = self.mask(x)
45
+ x = x + self.positional_embedding(x)
46
+ x = self.dropout(self.norm(x))
47
+ x = self.encoder(x, output_layer=layer)
48
+ return x, mask
49
+
50
+ def logits(self, x: torch.Tensor) -> torch.Tensor:
51
+ logits = torch.cosine_similarity(
52
+ x.unsqueeze(2),
53
+ self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
54
+ dim=-1,
55
+ )
56
+ return logits / 0.1
57
+
58
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
59
+ x, mask = self.encode(x)
60
+ x = self.proj(x)
61
+ logits = self.logits(x)
62
+ return logits, mask
63
+
64
+
65
+ class HubertSoft(Hubert):
66
+ def __init__(self):
67
+ super().__init__()
68
+
69
+ # @torch.inference_mode()
70
+ def units(self, wav: torch.Tensor) -> torch.Tensor:
71
+ wav = torch.nn.functional.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
72
+ x, _ = self.encode(wav)
73
+ return self.proj(x)
74
+
75
+ def forward(self, wav: torch.Tensor):
76
+ return self.units(wav)
77
+
78
+
79
+ class FeatureExtractor(nn.Module):
80
+ def __init__(self):
81
+ super().__init__()
82
+ self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
83
+ self.norm0 = nn.GroupNorm(512, 512)
84
+ self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
85
+ self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
86
+ self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
87
+ self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
88
+ self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
89
+ self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
90
+
91
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
92
+ x = t_func.gelu(self.norm0(self.conv0(x)))
93
+ x = t_func.gelu(self.conv1(x))
94
+ x = t_func.gelu(self.conv2(x))
95
+ x = t_func.gelu(self.conv3(x))
96
+ x = t_func.gelu(self.conv4(x))
97
+ x = t_func.gelu(self.conv5(x))
98
+ x = t_func.gelu(self.conv6(x))
99
+ return x
100
+
101
+
102
+ class FeatureProjection(nn.Module):
103
+ def __init__(self):
104
+ super().__init__()
105
+ self.norm = nn.LayerNorm(512)
106
+ self.projection = nn.Linear(512, 768)
107
+ self.dropout = nn.Dropout(0.1)
108
+
109
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
110
+ x = self.norm(x)
111
+ x = self.projection(x)
112
+ x = self.dropout(x)
113
+ return x
114
+
115
+
116
+ class PositionalConvEmbedding(nn.Module):
117
+ def __init__(self):
118
+ super().__init__()
119
+ self.conv = nn.Conv1d(
120
+ 768,
121
+ 768,
122
+ kernel_size=128,
123
+ padding=128 // 2,
124
+ groups=16,
125
+ )
126
+ self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
127
+
128
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
129
+ x = self.conv(x.transpose(1, 2))
130
+ x = t_func.gelu(x[:, :, :-1])
131
+ return x.transpose(1, 2)
132
+
133
+
134
+ class TransformerEncoder(nn.Module):
135
+ def __init__(
136
+ self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
137
+ ) -> None:
138
+ super(TransformerEncoder, self).__init__()
139
+ self.layers = nn.ModuleList(
140
+ [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
141
+ )
142
+ self.num_layers = num_layers
143
+
144
+ def forward(
145
+ self,
146
+ src: torch.Tensor,
147
+ mask: torch.Tensor = None,
148
+ src_key_padding_mask: torch.Tensor = None,
149
+ output_layer: Optional[int] = None,
150
+ ) -> torch.Tensor:
151
+ output = src
152
+ for layer in self.layers[:output_layer]:
153
+ output = layer(
154
+ output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
155
+ )
156
+ return output
157
+
158
+
159
+ def _compute_mask(
160
+ shape: Tuple[int, int],
161
+ mask_prob: float,
162
+ mask_length: int,
163
+ device: torch.device,
164
+ min_masks: int = 0,
165
+ ) -> torch.Tensor:
166
+ batch_size, sequence_length = shape
167
+
168
+ if mask_length < 1:
169
+ raise ValueError("`mask_length` has to be bigger than 0.")
170
+
171
+ if mask_length > sequence_length:
172
+ raise ValueError(
173
+ f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
174
+ )
175
+
176
+ # compute number of masked spans in batch
177
+ num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
178
+ num_masked_spans = max(num_masked_spans, min_masks)
179
+
180
+ # make sure num masked indices <= sequence_length
181
+ if num_masked_spans * mask_length > sequence_length:
182
+ num_masked_spans = sequence_length // mask_length
183
+
184
+ # SpecAugment mask to fill
185
+ mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
186
+
187
+ # uniform distribution to sample from, make sure that offset samples are < sequence_length
188
+ uniform_dist = torch.ones(
189
+ (batch_size, sequence_length - (mask_length - 1)), device=device
190
+ )
191
+
192
+ # get random indices to mask
193
+ mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
194
+
195
+ # expand masked indices to masked spans
196
+ mask_indices = (
197
+ mask_indices.unsqueeze(dim=-1)
198
+ .expand((batch_size, num_masked_spans, mask_length))
199
+ .reshape(batch_size, num_masked_spans * mask_length)
200
+ )
201
+ offsets = (
202
+ torch.arange(mask_length, device=device)[None, None, :]
203
+ .expand((batch_size, num_masked_spans, mask_length))
204
+ .reshape(batch_size, num_masked_spans * mask_length)
205
+ )
206
+ mask_idxs = mask_indices + offsets
207
+
208
+ # scatter indices to mask
209
+ mask = mask.scatter(1, mask_idxs, True)
210
+
211
+ return mask
212
+
213
+
214
+ def hubert_soft(
215
+ path: str
216
+ ) -> HubertSoft:
217
+ r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
218
+ Args:
219
+ path (str): path of a pretrained model
220
+ """
221
+ dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
222
+ hubert = HubertSoft()
223
+ checkpoint = torch.load(path, map_location="cpu")
224
+ consume_prefix_in_state_dict_if_present(checkpoint, "module.")
225
+ hubert.load_state_dict(checkpoint)
226
+ hubert.eval().to(dev)
227
+ return hubert
228
+
229
+
230
+ def get_units(hbt_soft, raw_wav_path, dev=torch.device('cuda')):
231
+ wav, sr = librosa.load(raw_wav_path, sr=None)
232
+ assert (sr >= 16000)
233
+ if len(wav.shape) > 1:
234
+ wav = librosa.to_mono(wav)
235
+ if sr != 16000:
236
+ wav16 = librosa.resample(wav, orig_sr=sr, target_sr=16000)  # keyword args, as required by librosa >= 0.10
237
+ else:
238
+ wav16 = wav
239
+ dev = torch.device("cuda" if (dev == torch.device('cuda') and torch.cuda.is_available()) else "cpu")
240
+ if torch.cuda.is_available(): torch.cuda.empty_cache()  # free cached GPU memory before inference
241
+ with torch.inference_mode():
242
+ units = hbt_soft.units(torch.FloatTensor(wav16.astype(float)).unsqueeze(0).unsqueeze(0).to(dev))
243
+ return units
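A minimal sketch of how hubert_soft and get_units above are typically combined; checkpoints/hubert/hubert_soft.pt is the checkpoint shipped in this repo, the wav path is a placeholder:

import torch
from modules.hubert.hubert_model import hubert_soft, get_units

hbt = hubert_soft("checkpoints/hubert/hubert_soft.pt")
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
units = get_units(hbt, "path/to/input.wav", dev=dev)
print(units.shape)   # (1, T, 256): one 256-dim soft unit per 20 ms of 16 kHz audio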
modules/hubert/hubert_onnx.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ import torch
4
+ import torchaudio
5
+
6
+
7
+ def get_onnx_units(hbt_soft, raw_wav_path):
8
+ source, sr = torchaudio.load(raw_wav_path)
9
+ source = torchaudio.functional.resample(source, sr, 16000)
10
+ if len(source.shape) == 2 and source.shape[0] >= 2:  # downmix multi-channel audio to mono
11
+ source = torch.mean(source, dim=0).unsqueeze(0)
12
+ source = source.unsqueeze(0)
13
+ # run inference with ONNX Runtime
14
+ start = time.time()
15
+ units = hbt_soft.run(output_names=["units"],
16
+ input_feed={"wav": source.numpy()})[0]
17
+ use_time = time.time() - start
18
+ print("hubert_onnx_session.run time:{}".format(use_time))
19
+ return units
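A minimal sketch of driving the helper above with onnxruntime; checkpoints/hubert/hubert.onnx is the exported model in this repo, the input/output names ("wav", "units") follow get_onnx_units, and the wav path is a placeholder:

import onnxruntime as ort
from modules.hubert.hubert_onnx import get_onnx_units

session = ort.InferenceSession("checkpoints/hubert/hubert.onnx",
                               providers=["CPUExecutionProvider"])
units = get_onnx_units(session, "path/to/input.wav")
print(units.shape)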
modules/nsf_hifigan/__pycache__/env.cpython-310.pyc ADDED
Binary file (813 Bytes). View file
 
modules/nsf_hifigan/__pycache__/env.cpython-38.pyc ADDED
Binary file (799 Bytes). View file
 
modules/nsf_hifigan/__pycache__/models.cpython-310.pyc ADDED
Binary file (16.1 kB). View file
 
modules/nsf_hifigan/__pycache__/models.cpython-38.pyc ADDED
Binary file (16.3 kB). View file
 
modules/nsf_hifigan/__pycache__/nvSTFT.cpython-310.pyc ADDED
Binary file (3.78 kB). View file
 
modules/nsf_hifigan/__pycache__/nvSTFT.cpython-38.pyc ADDED
Binary file (3.84 kB). View file