hzrr committed
Commit 2b37c27 · 1 Parent(s): 62f6e75
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full changeset.
Files changed (50)
  1. .gitattributes +0 -32
  2. LICENSE +0 -21
  3. README.md +4 -4
  4. app.py +60 -66
  5. attentions.py +294 -286
  6. commons.py +99 -100
  7. configs/yilanqiu.json → config.json +15 -10
  8. configs/nyarumul.json +0 -53
  9. configs/nyarusing.json +0 -52
  10. data.py +0 -36
  11. data_utils.py +12 -14
  12. hubert/__init__.py +0 -8
  13. hubert/__pycache__/__init__.cpython-38.pyc +0 -0
  14. hubert/__pycache__/model.cpython-38.pyc +0 -0
  15. hubert/dataset.py +0 -91
  16. hubert/utils.py +0 -58
  17. hubert/model.py → hubert_model.py +25 -91
  18. icassp2022_vocal_transcription/.gitignore +0 -3
  19. icassp2022_vocal_transcription/README.md +0 -56
  20. icassp2022_vocal_transcription/__init__.py +0 -3
  21. icassp2022_vocal_transcription/__pycache__/__init__.cpython-38.pyc +0 -0
  22. icassp2022_vocal_transcription/data/weight_ST.hdf5 +0 -3
  23. icassp2022_vocal_transcription/data/x_train_mean.npy +0 -3
  24. icassp2022_vocal_transcription/data/x_train_std.npy +0 -3
  25. icassp2022_vocal_transcription/img/ICASSP2022-fig1-2.png +0 -0
  26. icassp2022_vocal_transcription/img/example_pop1_midi.png +0 -0
  27. icassp2022_vocal_transcription/requirements.txt +0 -8
  28. icassp2022_vocal_transcription/src/MIDI.py +0 -141
  29. icassp2022_vocal_transcription/src/__init__.py +0 -0
  30. icassp2022_vocal_transcription/src/__pycache__/MIDI.cpython-38.pyc +0 -0
  31. icassp2022_vocal_transcription/src/__pycache__/__init__.cpython-38.pyc +0 -0
  32. icassp2022_vocal_transcription/src/__pycache__/featureExtraction.cpython-38.pyc +0 -0
  33. icassp2022_vocal_transcription/src/__pycache__/model.cpython-38.pyc +0 -0
  34. icassp2022_vocal_transcription/src/__pycache__/quantization.cpython-38.pyc +0 -0
  35. icassp2022_vocal_transcription/src/__pycache__/singing_transcription.cpython-38.pyc +0 -0
  36. icassp2022_vocal_transcription/src/__pycache__/utils.cpython-38.pyc +0 -0
  37. icassp2022_vocal_transcription/src/featureExtraction.py +0 -61
  38. icassp2022_vocal_transcription/src/model.py +0 -139
  39. icassp2022_vocal_transcription/src/quantization.py +0 -217
  40. icassp2022_vocal_transcription/src/singing_transcription.py +0 -147
  41. icassp2022_vocal_transcription/src/utils.py +0 -49
  42. infer_tool.py +132 -57
  43. models.py +9 -15
  44. modules.py +282 -284
  45. preprocess_wave.py +118 -0
  46. requirements.txt +7 -7
  47. text/LICENSE +0 -19
  48. text/__init__.py +0 -54
  49. text/cleaners.py +0 -100
  50. text/symbols.py +0 -16
.gitattributes DELETED
@@ -1,32 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
- icassp2022_vocal_transcription/data/weight_ST.hdf5 filter=lfs diff=lfs merge=lfs -text
LICENSE DELETED
@@ -1,21 +0,0 @@
- MIT License
-
- Copyright (c) 2021 Jaehyeon Kim
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: Sovits Midi Dev
- emoji: 🐨
- colorFrom: blue
- colorTo: red
  sdk: gradio
  sdk_version: 3.4
  app_file: app.py

  ---
+ title: Sovits F0
+ emoji: 🚀
+ colorFrom: purple
+ colorTo: gray
  sdk: gradio
  sdk_version: 3.4
  app_file: app.py
app.py CHANGED
@@ -1,77 +1,45 @@
- import logging

  import gradio as gr
  import torch
- import torchaudio

- import hubert
- import icassp2022_vocal_transcription
  import infer_tool
- import utils
- from models import SynthesizerTrn

- dev = torch.device("cpu")
- numba_logger = logging.getLogger('numba')
- numba_logger.setLevel(logging.WARNING)
  convert_cnt = [0]

- hps_ms = utils.get_hparams_from_file("configs/yilanqiu.json")
- net_g_ms = SynthesizerTrn(
- 178,
- hps_ms.data.filter_length // 2 + 1,
- hps_ms.train.segment_size // hps_ms.data.hop_length,
- n_speakers=hps_ms.data.n_speakers,
- **hps_ms.model)
-
- hubert_soft = hubert.hubert_soft('hubert.pt')
- _ = utils.load_checkpoint("1121_epochs.pth", net_g_ms, None)
- _ = net_g_ms.eval().to(dev)


  def vc_fn(sid, audio_record, audio_upload, tran):
  if audio_upload is not None:
  audio_path = audio_upload
  elif audio_record is not None:
  audio_path = audio_record
  else:
- return "你需要上传wav文件或自行录音", None
- target_sample = hps_ms.data.sampling_rate
- audio_path = infer_tool.wav_resample(audio_path, target_sample)
- audio, sampling_rate = torchaudio.load(audio_path)
  duration = audio.shape[0] / sampling_rate
- if duration > 45:
- return "请上传小于45s的音频,需要转换长音频请使用colab", None
-
- soft = infer_tool.get_units(audio_path, hubert_soft).squeeze(0).cpu().numpy()
-
- pitch = icassp2022_vocal_transcription.transcribe(audio_path)
- pitch[pitch != 0] = pitch[pitch != 0] + tran
- if tran == 100:
- pitch[:] = 0
- pitch = infer_tool.resize2d_plus(pitch, len(soft[:, 0]))
- pitch = torch.LongTensor(pitch).unsqueeze(0).to(dev)
-
- sid = torch.LongTensor([2]).to(dev) if sid == "" else torch.LongTensor([1]).to(dev)
- stn_tst = torch.FloatTensor(soft)
- with torch.no_grad():
- x_tst = stn_tst.unsqueeze(0).to(dev)
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
- audio = net_g_ms.infer(x_tst, x_tst_lengths, pitch=pitch, sid=sid, noise_scale=0.3,
- noise_scale_w=0.1, length_scale=1)[0][0, 0].data.float().cpu().numpy()
- convert_cnt[0] += 1
- print(convert_cnt[0])
- return "Success", (hps_ms.data.sampling_rate, audio)
-
- character_dict = {
- "夜刀神十香": 1,
- "鸢一折纸": 2,
- "时崎狂三": 3,
- "冰芽川四糸乃": 4,
- "五河琴里": 5,
- "八舞夕弦": 6,
- "八舞耶俱矢": 7,
- "诱宵美九": 8,
- }


  app = gr.Blocks()
@@ -79,26 +47,52 @@ with app:
  with gr.Tabs():
  with gr.TabItem("Basic"):
  gr.Markdown(value="""
- 本模型为sovits_midi(专供语音合成,为下面git的dev分支)

- 本hug仅供一键秋秋人使用(有语音授权,但是二创不要创死主播)

- 支持**45s以内**的**无伴奏wav格式**,或使用**网页内置**的录音(二选一),转换效果取决于源音频语气、节奏是否与目标音色相近。

- 如:女声歌曲转换,相似度远小于男声转换

- 该模型的 [github仓库链接](https://github.com/innnky/so-vits-svc)

- 如果想自己制作并训练模型可以访问这个 [github仓库](https://github.com/IceKyrin/sovits_guide)
  """)
- speaker_id = gr.Dropdown(label="音色", choices=list(character_dict.keys()))
  record_input = gr.Audio(source="microphone", label="录制你的声音", type="filepath", elem_id="audio_inputs")
  upload_input = gr.Audio(source="upload", label="上传音频(长度小于45秒)", type="filepath",
  elem_id="audio_inputs")
- vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
  vc_submit = gr.Button("转换", variant="primary")
  out_message = gr.Textbox(label="Output Message")
  out_audio = gr.Audio(label="Output Audio")
- vc_submit.click(vc_fn, [character_dict[speaker_id], record_input, upload_input, vc_transform], [out_message, out_audio])

  app.launch()

+ import time

  import gradio as gr
+ import soundfile
  import torch

  import infer_tool

  convert_cnt = [0]
+ dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model_name = "152_epochs.pth"
+ config_name = "nyarumul.json"
+ net_g_ms, hubert_soft, feature_input, hps_ms = infer_tool.load_model(f"{model_name}", f"configs/{config_name}")

+ # 获取config参数
+ target_sample = hps_ms.data.sampling_rate
+ spk_dict = {
+ "奕兰秋": 4
+ }


  def vc_fn(sid, audio_record, audio_upload, tran):
+ print(sid, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
  if audio_upload is not None:
  audio_path = audio_upload
  elif audio_record is not None:
  audio_path = audio_record
  else:
+ return "你需要上传wav文件或使用网页内置的录音!", None
+
+ audio, sampling_rate = infer_tool.format_wav(audio_path, target_sample)
  duration = audio.shape[0] / sampling_rate
+ if duration > 60:
+ return "请上传小于60s的音频,需要转换长音频请使用colab", None
+
+ o_audio, out_sr = infer_tool.infer(audio_path, spk_dict[sid], tran, net_g_ms, hubert_soft, feature_input)
+ out_path = f"./out_temp.wav"
+ soundfile.write(out_path, o_audio, target_sample)
+ infer_tool.f0_plt(audio_path, out_path, tran, hubert_soft, feature_input)
+ mistake, var = infer_tool.calc_error(audio_path, out_path, tran, feature_input)
+ return f"分段误差参考:0.3优秀,0.5左右合理,少量0.8-1可以接受\n若偏差过大,请调整升降半音数;多次调整均过大、说明超出歌手音域\n半音偏差:{mistake}\n半音方差:{var}", (
+ target_sample, o_audio), gr.Image.update("temp.jpg")


  app = gr.Blocks()

  with gr.Tabs():
  with gr.TabItem("Basic"):
  gr.Markdown(value="""
+ 本音源有授权,二创不创死主播即可。[其他音色体验](https://huggingface.co/spaces/innnky/nyaru-svc2.0-advanced)
+
+ 本模型为sovits_f0,支持**60s以内**的**无伴奏**wav、mp3格式,或使用**网页内置**的录音(二选一)
+
+ **error就用格式工厂自行转换为wav再上传**
+
+ 转换效果取决于源音频语气、节奏是否与目标音色相近。

+ 源音频为女声时,**建议降3-6key**,**最后的输出误差越接近0,音准越高**

+ 源音频为**低音男声**时,**建议升3key,具体看曲线图情况**

+ f0曲线可以直观的显示跑调情况,蓝色为输入音高,橙色为合成音频的音高

+ 若**只看见橙色**,说明蓝色曲线被覆盖,转换效果较好

  """)
+ speaker_id = gr.Dropdown(label="音色", choices=["奕兰秋"], value="奕兰秋")
  record_input = gr.Audio(source="microphone", label="录制你的声音", type="filepath", elem_id="audio_inputs")
  upload_input = gr.Audio(source="upload", label="上传音频(长度小于45秒)", type="filepath",
  elem_id="audio_inputs")
+ vc_transform = gr.Number(label="升降半音(整数,可以正负,半音数量,升高八度就是12)", value=0)
  vc_submit = gr.Button("转换", variant="primary")
  out_message = gr.Textbox(label="Output Message")
  out_audio = gr.Audio(label="Output Audio")
+ f0_image = gr.Image(label="f0曲线")
+ vc_submit.click(vc_fn, [speaker_id, record_input, upload_input, vc_transform],
+ [out_message, out_audio, f0_image])
+ with gr.TabItem("使用说明"):
+ gr.Markdown(value="""
+ 0、合集:https://github.com/IceKyrin/sovits_guide/blob/main/README.md
+
+ 1、仅支持sovit_f0(sovits2.0)模型
+
+ 2、自行下载hubert-soft-0d54a1f4.pt改名为hubert.pt(已经下好了)
+ https://github.com/bshall/hubert/releases/tag/v0.1
+
+ 3、pth文件夹下放置sovits2.0的模型
+
+ 4、与模型配套的xxx.json,需有speaker项——人物列表
+
+ 5、放无伴奏的音频、或网页内置录音,不要放奇奇怪怪的格式
+
+ 6、仅供交流使用,不对用户行为负责
+
+ 7、268000为44100预模型,配合sovits_pre.json;50000为22050预模型,配合nyarumul.json

+ """)
  app.launch()
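For orientation, the new app.py routes everything through infer_tool: load the checkpoint and config once, resample the upload, run inference, then plot and score the f0 error. A minimal standalone sketch of that same flow, reusing only the calls and file names visible in this diff (the input path and speaker id below are illustrative placeholders), could look like:

```python
# Hypothetical standalone sketch of the inference flow used by the new app.py.
# Checkpoint/config names and infer_tool signatures are taken from the diff above;
# the input path is a placeholder and may differ in an actual deployment.
import soundfile
import infer_tool

net_g_ms, hubert_soft, feature_input, hps_ms = infer_tool.load_model("152_epochs.pth", "configs/nyarumul.json")
target_sample = hps_ms.data.sampling_rate

audio_path = "input.wav"   # unaccompanied vocal, under 60 s (placeholder path)
speaker_id = 4             # "奕兰秋" in spk_dict above
tran = 0                   # transpose, in semitones

# Resample/format the input to the model's sampling rate.
audio, sampling_rate = infer_tool.format_wav(audio_path, target_sample)

# Run voice conversion and write the result to disk.
o_audio, out_sr = infer_tool.infer(audio_path, speaker_id, tran, net_g_ms, hubert_soft, feature_input)
soundfile.write("out_temp.wav", o_audio, target_sample)

# Plot the f0 curves and report the semitone error, as the web UI does.
infer_tool.f0_plt(audio_path, "out_temp.wav", tran, hubert_soft, feature_input)
mistake, var = infer_tool.calc_error(audio_path, "out_temp.wav", tran, feature_input)
print(f"semitone error: {mistake}, variance: {var}")
```

The vc_fn() callback above performs exactly these steps, plus the Gradio plumbing around them.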
attentions.py CHANGED
@@ -1,303 +1,311 @@
1
- import copy
2
  import math
3
- import numpy as np
4
  import torch
5
  from torch import nn
6
- from torch.nn import functional as F
7
 
8
  import commons
9
- import modules
10
  from modules import LayerNorm
11
-
12
 
13
  class Encoder(nn.Module):
14
- def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
15
- super().__init__()
16
- self.hidden_channels = hidden_channels
17
- self.filter_channels = filter_channels
18
- self.n_heads = n_heads
19
- self.n_layers = n_layers
20
- self.kernel_size = kernel_size
21
- self.p_dropout = p_dropout
22
- self.window_size = window_size
23
-
24
- self.drop = nn.Dropout(p_dropout)
25
- self.attn_layers = nn.ModuleList()
26
- self.norm_layers_1 = nn.ModuleList()
27
- self.ffn_layers = nn.ModuleList()
28
- self.norm_layers_2 = nn.ModuleList()
29
- for i in range(self.n_layers):
30
- self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
31
- self.norm_layers_1.append(LayerNorm(hidden_channels))
32
- self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
33
- self.norm_layers_2.append(LayerNorm(hidden_channels))
34
-
35
- def forward(self, x, x_mask):
36
- attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
37
- x = x * x_mask
38
- for i in range(self.n_layers):
39
- y = self.attn_layers[i](x, x, attn_mask)
40
- y = self.drop(y)
41
- x = self.norm_layers_1[i](x + y)
42
-
43
- y = self.ffn_layers[i](x, x_mask)
44
- y = self.drop(y)
45
- x = self.norm_layers_2[i](x + y)
46
- x = x * x_mask
47
- return x
 
 
49
 
50
  class Decoder(nn.Module):
51
- def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
52
- super().__init__()
53
- self.hidden_channels = hidden_channels
54
- self.filter_channels = filter_channels
55
- self.n_heads = n_heads
56
- self.n_layers = n_layers
57
- self.kernel_size = kernel_size
58
- self.p_dropout = p_dropout
59
- self.proximal_bias = proximal_bias
60
- self.proximal_init = proximal_init
61
-
62
- self.drop = nn.Dropout(p_dropout)
63
- self.self_attn_layers = nn.ModuleList()
64
- self.norm_layers_0 = nn.ModuleList()
65
- self.encdec_attn_layers = nn.ModuleList()
66
- self.norm_layers_1 = nn.ModuleList()
67
- self.ffn_layers = nn.ModuleList()
68
- self.norm_layers_2 = nn.ModuleList()
69
- for i in range(self.n_layers):
70
- self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
71
- self.norm_layers_0.append(LayerNorm(hidden_channels))
72
- self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
73
- self.norm_layers_1.append(LayerNorm(hidden_channels))
74
- self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
75
- self.norm_layers_2.append(LayerNorm(hidden_channels))
76
-
77
- def forward(self, x, x_mask, h, h_mask):
78
- """
79
- x: decoder input
80
- h: encoder output
81
- """
82
- self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
83
- encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
84
- x = x * x_mask
85
- for i in range(self.n_layers):
86
- y = self.self_attn_layers[i](x, x, self_attn_mask)
87
- y = self.drop(y)
88
- x = self.norm_layers_0[i](x + y)
89
-
90
- y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
91
- y = self.drop(y)
92
- x = self.norm_layers_1[i](x + y)
93
-
94
- y = self.ffn_layers[i](x, x_mask)
95
- y = self.drop(y)
96
- x = self.norm_layers_2[i](x + y)
97
- x = x * x_mask
98
- return x
 
 
100
 
101
  class MultiHeadAttention(nn.Module):
102
- def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
103
- super().__init__()
104
- assert channels % n_heads == 0
105
-
106
- self.channels = channels
107
- self.out_channels = out_channels
108
- self.n_heads = n_heads
109
- self.p_dropout = p_dropout
110
- self.window_size = window_size
111
- self.heads_share = heads_share
112
- self.block_length = block_length
113
- self.proximal_bias = proximal_bias
114
- self.proximal_init = proximal_init
115
- self.attn = None
116
-
117
- self.k_channels = channels // n_heads
118
- self.conv_q = nn.Conv1d(channels, channels, 1)
119
- self.conv_k = nn.Conv1d(channels, channels, 1)
120
- self.conv_v = nn.Conv1d(channels, channels, 1)
121
- self.conv_o = nn.Conv1d(channels, out_channels, 1)
122
- self.drop = nn.Dropout(p_dropout)
123
-
124
- if window_size is not None:
125
- n_heads_rel = 1 if heads_share else n_heads
126
- rel_stddev = self.k_channels**-0.5
127
- self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
128
- self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
129
-
130
- nn.init.xavier_uniform_(self.conv_q.weight)
131
- nn.init.xavier_uniform_(self.conv_k.weight)
132
- nn.init.xavier_uniform_(self.conv_v.weight)
133
- if proximal_init:
134
- with torch.no_grad():
135
- self.conv_k.weight.copy_(self.conv_q.weight)
136
- self.conv_k.bias.copy_(self.conv_q.bias)
137
-
138
- def forward(self, x, c, attn_mask=None):
139
- q = self.conv_q(x)
140
- k = self.conv_k(c)
141
- v = self.conv_v(c)
142
-
143
- x, self.attn = self.attention(q, k, v, mask=attn_mask)
144
-
145
- x = self.conv_o(x)
146
- return x
147
-
148
- def attention(self, query, key, value, mask=None):
149
- # reshape [b, d, t] -> [b, n_h, t, d_k]
150
- b, d, t_s, t_t = (*key.size(), query.size(2))
151
- query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
152
- key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
153
- value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
154
-
155
- scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
156
- if self.window_size is not None:
157
- assert t_s == t_t, "Relative attention is only available for self-attention."
158
- key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
159
- rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings)
160
- scores_local = self._relative_position_to_absolute_position(rel_logits)
161
- scores = scores + scores_local
162
- if self.proximal_bias:
163
- assert t_s == t_t, "Proximal bias is only available for self-attention."
164
- scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
165
- if mask is not None:
166
- scores = scores.masked_fill(mask == 0, -1e4)
167
- if self.block_length is not None:
168
- assert t_s == t_t, "Local attention is only available for self-attention."
169
- block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
170
- scores = scores.masked_fill(block_mask == 0, -1e4)
171
- p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
172
- p_attn = self.drop(p_attn)
173
- output = torch.matmul(p_attn, value)
174
- if self.window_size is not None:
175
- relative_weights = self._absolute_position_to_relative_position(p_attn)
176
- value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
177
- output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
178
- output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
179
- return output, p_attn
180
-
181
- def _matmul_with_relative_values(self, x, y):
182
- """
183
- x: [b, h, l, m]
184
- y: [h or 1, m, d]
185
- ret: [b, h, l, d]
186
- """
187
- ret = torch.matmul(x, y.unsqueeze(0))
188
- return ret
189
-
190
- def _matmul_with_relative_keys(self, x, y):
191
- """
192
- x: [b, h, l, d]
193
- y: [h or 1, m, d]
194
- ret: [b, h, l, m]
195
- """
196
- ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
197
- return ret
198
-
199
- def _get_relative_embeddings(self, relative_embeddings, length):
200
- max_relative_position = 2 * self.window_size + 1
201
- # Pad first before slice to avoid using cond ops.
202
- pad_length = max(length - (self.window_size + 1), 0)
203
- slice_start_position = max((self.window_size + 1) - length, 0)
204
- slice_end_position = slice_start_position + 2 * length - 1
205
- if pad_length > 0:
206
- padded_relative_embeddings = F.pad(
207
- relative_embeddings,
208
- commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
209
- else:
210
- padded_relative_embeddings = relative_embeddings
211
- used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
212
- return used_relative_embeddings
213
-
214
- def _relative_position_to_absolute_position(self, x):
215
- """
216
- x: [b, h, l, 2*l-1]
217
- ret: [b, h, l, l]
218
- """
219
- batch, heads, length, _ = x.size()
220
- # Concat columns of pad to shift from relative to absolute indexing.
221
- x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
222
-
223
- # Concat extra elements so to add up to shape (len+1, 2*len-1).
224
- x_flat = x.view([batch, heads, length * 2 * length])
225
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]]))
226
-
227
- # Reshape and slice out the padded elements.
228
- x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
229
- return x_final
230
-
231
- def _absolute_position_to_relative_position(self, x):
232
- """
233
- x: [b, h, l, l]
234
- ret: [b, h, l, 2*l-1]
235
- """
236
- batch, heads, length, _ = x.size()
237
- # padd along column
238
- x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
239
- x_flat = x.view([batch, heads, length**2 + length*(length -1)])
240
- # add 0's in the beginning that will skew the elements after reshape
241
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
242
- x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
243
- return x_final
244
-
245
- def _attention_bias_proximal(self, length):
246
- """Bias for self-attention to encourage attention to close positions.
247
- Args:
248
- length: an integer scalar.
249
- Returns:
250
- a Tensor with shape [1, 1, length, length]
251
- """
252
- r = torch.arange(length, dtype=torch.float32)
253
- diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
254
- return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
 
255
 
256
 
257
  class FFN(nn.Module):
258
- def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
259
- super().__init__()
260
- self.in_channels = in_channels
261
- self.out_channels = out_channels
262
- self.filter_channels = filter_channels
263
- self.kernel_size = kernel_size
264
- self.p_dropout = p_dropout
265
- self.activation = activation
266
- self.causal = causal
267
-
268
- if causal:
269
- self.padding = self._causal_padding
270
- else:
271
- self.padding = self._same_padding
272
-
273
- self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
274
- self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
275
- self.drop = nn.Dropout(p_dropout)
276
-
277
- def forward(self, x, x_mask):
278
- x = self.conv_1(self.padding(x * x_mask))
279
- if self.activation == "gelu":
280
- x = x * torch.sigmoid(1.702 * x)
281
- else:
282
- x = torch.relu(x)
283
- x = self.drop(x)
284
- x = self.conv_2(self.padding(x * x_mask))
285
- return x * x_mask
286
-
287
- def _causal_padding(self, x):
288
- if self.kernel_size == 1:
289
- return x
290
- pad_l = self.kernel_size - 1
291
- pad_r = 0
292
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
293
- x = F.pad(x, commons.convert_pad_shape(padding))
294
- return x
295
-
296
- def _same_padding(self, x):
297
- if self.kernel_size == 1:
298
- return x
299
- pad_l = (self.kernel_size - 1) // 2
300
- pad_r = self.kernel_size // 2
301
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
302
- x = F.pad(x, commons.convert_pad_shape(padding))
303
- return x
 
 
 
1
  import math
2
+
3
  import torch
4
  from torch import nn
5
+ from torch.nn import functional as t_func
6
 
7
  import commons
 
8
  from modules import LayerNorm
9
+
10
 
11
  class Encoder(nn.Module):
12
+ def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4,
13
+ **kwargs):
14
+ super().__init__()
15
+ self.hidden_channels = hidden_channels
16
+ self.filter_channels = filter_channels
17
+ self.n_heads = n_heads
18
+ self.n_layers = n_layers
19
+ self.kernel_size = kernel_size
20
+ self.p_dropout = p_dropout
21
+ self.window_size = window_size
22
+
23
+ self.drop = nn.Dropout(p_dropout)
24
+ self.attn_layers = nn.ModuleList()
25
+ self.norm_layers_1 = nn.ModuleList()
26
+ self.ffn_layers = nn.ModuleList()
27
+ self.norm_layers_2 = nn.ModuleList()
28
+ for i in range(self.n_layers):
29
+ self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
30
+ window_size=window_size))
31
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
32
+ self.ffn_layers.append(
33
+ FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
34
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
35
+
36
+ def forward(self, x, x_mask):
37
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
38
+ x = x * x_mask
39
+ for i in range(self.n_layers):
40
+ y = self.attn_layers[i](x, x, attn_mask)
41
+ y = self.drop(y)
42
+ x = self.norm_layers_1[i](x + y)
43
+
44
+ y = self.ffn_layers[i](x, x_mask)
45
+ y = self.drop(y)
46
+ x = self.norm_layers_2[i](x + y)
47
+ x = x * x_mask
48
+ return x
49
 
50
 
51
  class Decoder(nn.Module):
52
+ def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.,
53
+ proximal_bias=False, proximal_init=True, **kwargs):
54
+ super().__init__()
55
+ self.hidden_channels = hidden_channels
56
+ self.filter_channels = filter_channels
57
+ self.n_heads = n_heads
58
+ self.n_layers = n_layers
59
+ self.kernel_size = kernel_size
60
+ self.p_dropout = p_dropout
61
+ self.proximal_bias = proximal_bias
62
+ self.proximal_init = proximal_init
63
+
64
+ self.drop = nn.Dropout(p_dropout)
65
+ self.self_attn_layers = nn.ModuleList()
66
+ self.norm_layers_0 = nn.ModuleList()
67
+ self.encdec_attn_layers = nn.ModuleList()
68
+ self.norm_layers_1 = nn.ModuleList()
69
+ self.ffn_layers = nn.ModuleList()
70
+ self.norm_layers_2 = nn.ModuleList()
71
+ for i in range(self.n_layers):
72
+ self.self_attn_layers.append(
73
+ MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
74
+ proximal_bias=proximal_bias, proximal_init=proximal_init))
75
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
76
+ self.encdec_attn_layers.append(
77
+ MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
78
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
79
+ self.ffn_layers.append(
80
+ FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
81
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
82
+
83
+ def forward(self, x, x_mask, h, h_mask):
84
+ """
85
+ x: decoder input
86
+ h: encoder output
87
+ """
88
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
89
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
90
+ x = x * x_mask
91
+ for i in range(self.n_layers):
92
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
93
+ y = self.drop(y)
94
+ x = self.norm_layers_0[i](x + y)
95
+
96
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
97
+ y = self.drop(y)
98
+ x = self.norm_layers_1[i](x + y)
99
+
100
+ y = self.ffn_layers[i](x, x_mask)
101
+ y = self.drop(y)
102
+ x = self.norm_layers_2[i](x + y)
103
+ x = x * x_mask
104
+ return x
105
 
106
 
107
  class MultiHeadAttention(nn.Module):
108
+ def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True,
109
+ block_length=None, proximal_bias=False, proximal_init=False):
110
+ super().__init__()
111
+ assert channels % n_heads == 0
112
+
113
+ self.channels = channels
114
+ self.out_channels = out_channels
115
+ self.n_heads = n_heads
116
+ self.p_dropout = p_dropout
117
+ self.window_size = window_size
118
+ self.heads_share = heads_share
119
+ self.block_length = block_length
120
+ self.proximal_bias = proximal_bias
121
+ self.proximal_init = proximal_init
122
+ self.attn = None
123
+
124
+ self.k_channels = channels // n_heads
125
+ self.conv_q = nn.Conv1d(channels, channels, 1)
126
+ self.conv_k = nn.Conv1d(channels, channels, 1)
127
+ self.conv_v = nn.Conv1d(channels, channels, 1)
128
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
129
+ self.drop = nn.Dropout(p_dropout)
130
+
131
+ if window_size is not None:
132
+ n_heads_rel = 1 if heads_share else n_heads
133
+ rel_stddev = self.k_channels ** -0.5
134
+ self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
135
+ self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
136
+
137
+ nn.init.xavier_uniform_(self.conv_q.weight)
138
+ nn.init.xavier_uniform_(self.conv_k.weight)
139
+ nn.init.xavier_uniform_(self.conv_v.weight)
140
+ if proximal_init:
141
+ with torch.no_grad():
142
+ self.conv_k.weight.copy_(self.conv_q.weight)
143
+ self.conv_k.bias.copy_(self.conv_q.bias)
144
+
145
+ def forward(self, x, c, attn_mask=None):
146
+ q = self.conv_q(x)
147
+ k = self.conv_k(c)
148
+ v = self.conv_v(c)
149
+
150
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
151
+
152
+ x = self.conv_o(x)
153
+ return x
154
+
155
+ def attention(self, query, key, value, mask=None):
156
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
157
+ b, d, t_s, t_t = (*key.size(), query.size(2))
158
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
159
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
160
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
161
+
162
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
163
+ if self.window_size is not None:
164
+ assert t_s == t_t, "Relative attention is only available for self-attention."
165
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
166
+ rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
167
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
168
+ scores = scores + scores_local
169
+ if self.proximal_bias:
170
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
171
+ scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
172
+ if mask is not None:
173
+ scores = scores.masked_fill(mask == 0, -1e4)
174
+ if self.block_length is not None:
175
+ assert t_s == t_t, "Local attention is only available for self-attention."
176
+ block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
177
+ scores = scores.masked_fill(block_mask == 0, -1e4)
178
+ p_attn = t_func.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
179
+ p_attn = self.drop(p_attn)
180
+ output = torch.matmul(p_attn, value)
181
+ if self.window_size is not None:
182
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
183
+ value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
184
+ output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
185
+ output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
186
+ return output, p_attn
187
+
188
+ def _matmul_with_relative_values(self, x, y):
189
+ """
190
+ x: [b, h, l, m]
191
+ y: [h or 1, m, d]
192
+ ret: [b, h, l, d]
193
+ """
194
+ ret = torch.matmul(x, y.unsqueeze(0))
195
+ return ret
196
+
197
+ def _matmul_with_relative_keys(self, x, y):
198
+ """
199
+ x: [b, h, l, d]
200
+ y: [h or 1, m, d]
201
+ ret: [b, h, l, m]
202
+ """
203
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
204
+ return ret
205
+
206
+ def _get_relative_embeddings(self, relative_embeddings, length):
207
+ max_relative_position = 2 * self.window_size + 1
208
+ # Pad first before slice to avoid using cond ops.
209
+ pad_length = max(length - (self.window_size + 1), 0)
210
+ slice_start_position = max((self.window_size + 1) - length, 0)
211
+ slice_end_position = slice_start_position + 2 * length - 1
212
+ if pad_length > 0:
213
+ padded_relative_embeddings = t_func.pad(
214
+ relative_embeddings,
215
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
216
+ else:
217
+ padded_relative_embeddings = relative_embeddings
218
+ used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
219
+ return used_relative_embeddings
220
+
221
+ def _relative_position_to_absolute_position(self, x):
222
+ """
223
+ x: [b, h, l, 2*l-1]
224
+ ret: [b, h, l, l]
225
+ """
226
+ batch, heads, length, _ = x.size()
227
+ # Concat columns of pad to shift from relative to absolute indexing.
228
+ x = t_func.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
229
+
230
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
231
+ x_flat = x.view([batch, heads, length * 2 * length])
232
+ x_flat = t_func.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
233
+
234
+ # Reshape and slice out the padded elements.
235
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
236
+ return x_final
237
+
238
+ def _absolute_position_to_relative_position(self, x):
239
+ """
240
+ x: [b, h, l, l]
241
+ ret: [b, h, l, 2*l-1]
242
+ """
243
+ batch, heads, length, _ = x.size()
244
+ # padd along column
245
+ x = t_func.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
246
+ x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
247
+ # add 0's in the beginning that will skew the elements after reshape
248
+ x_flat = t_func.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
249
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
250
+ return x_final
251
+
252
+ def _attention_bias_proximal(self, length):
253
+ """Bias for self-attention to encourage attention to close positions.
254
+ Args:
255
+ length: an integer scalar.
256
+ Returns:
257
+ a Tensor with shape [1, 1, length, length]
258
+ """
259
+ r = torch.arange(length, dtype=torch.float32)
260
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
261
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
262
 
263
 
264
  class FFN(nn.Module):
265
+ def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None,
266
+ causal=False):
267
+ super().__init__()
268
+ self.in_channels = in_channels
269
+ self.out_channels = out_channels
270
+ self.filter_channels = filter_channels
271
+ self.kernel_size = kernel_size
272
+ self.p_dropout = p_dropout
273
+ self.activation = activation
274
+ self.causal = causal
275
+
276
+ if causal:
277
+ self.padding = self._causal_padding
278
+ else:
279
+ self.padding = self._same_padding
280
+
281
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
282
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
283
+ self.drop = nn.Dropout(p_dropout)
284
+
285
+ def forward(self, x, x_mask):
286
+ x = self.conv_1(self.padding(x * x_mask))
287
+ if self.activation == "gelu":
288
+ x = x * torch.sigmoid(1.702 * x)
289
+ else:
290
+ x = torch.relu(x)
291
+ x = self.drop(x)
292
+ x = self.conv_2(self.padding(x * x_mask))
293
+ return x * x_mask
294
+
295
+ def _causal_padding(self, x):
296
+ if self.kernel_size == 1:
297
+ return x
298
+ pad_l = self.kernel_size - 1
299
+ pad_r = 0
300
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
301
+ x = t_func.pad(x, commons.convert_pad_shape(padding))
302
+ return x
303
+
304
+ def _same_padding(self, x):
305
+ if self.kernel_size == 1:
306
+ return x
307
+ pad_l = (self.kernel_size - 1) // 2
308
+ pad_r = self.kernel_size // 2
309
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
310
+ x = t_func.pad(x, commons.convert_pad_shape(padding))
311
+ return x
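The attentions.py change is a formatting pass (line wrapping plus renaming the functional import from F to t_func); the Encoder/Decoder/MultiHeadAttention logic is unchanged. A small smoke test of the windowed relative-attention Encoder, with illustrative dimensions borrowed from config.json (hidden 256, 2 heads, 6 layers), might be:

```python
# Smoke test for the relative-position Encoder defined in attentions.py.
# Shapes follow its forward() signature: x is [batch, hidden_channels, time],
# x_mask is [batch, 1, time]. The dimensions are illustration values only.
import torch
import commons
from attentions import Encoder

enc = Encoder(hidden_channels=256, filter_channels=768, n_heads=2,
              n_layers=6, kernel_size=3, p_dropout=0.1, window_size=4)
enc.eval()

x = torch.randn(2, 256, 100)                 # two sequences, 100 frames each
lengths = torch.tensor([100, 80])            # the second sequence is padded
x_mask = commons.sequence_mask(lengths, 100).unsqueeze(1).float()  # [2, 1, 100]

with torch.no_grad():
    y = enc(x, x_mask)
print(y.shape)  # torch.Size([2, 256, 100]); padded frames stay zeroed by the mask
```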
commons.py CHANGED
@@ -1,161 +1,160 @@
1
  import math
2
- import numpy as np
3
  import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
 
7
 
8
  def init_weights(m, mean=0.0, std=0.01):
9
- classname = m.__class__.__name__
10
- if classname.find("Conv") != -1:
11
- m.weight.data.normal_(mean, std)
12
 
13
 
14
  def get_padding(kernel_size, dilation=1):
15
- return int((kernel_size*dilation - dilation)/2)
16
 
17
 
18
  def convert_pad_shape(pad_shape):
19
- l = pad_shape[::-1]
20
- pad_shape = [item for sublist in l for item in sublist]
21
- return pad_shape
22
 
23
 
24
  def intersperse(lst, item):
25
- result = [item] * (len(lst) * 2 + 1)
26
- result[1::2] = lst
27
- return result
28
 
29
 
30
  def kl_divergence(m_p, logs_p, m_q, logs_q):
31
- """KL(P||Q)"""
32
- kl = (logs_q - logs_p) - 0.5
33
- kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
34
- return kl
35
 
36
 
37
  def rand_gumbel(shape):
38
- """Sample from the Gumbel distribution, protect from overflows."""
39
- uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
40
- return -torch.log(-torch.log(uniform_samples))
41
 
42
 
43
  def rand_gumbel_like(x):
44
- g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
45
- return g
46
 
47
 
48
  def slice_segments(x, ids_str, segment_size=4):
49
- ret = torch.zeros_like(x[:, :, :segment_size])
50
- for i in range(x.size(0)):
51
- idx_str = ids_str[i]
52
- idx_end = idx_str + segment_size
53
- ret[i] = x[i, :, idx_str:idx_end]
54
- return ret
55
 
56
 
57
  def rand_slice_segments(x, x_lengths=None, segment_size=4):
58
- b, d, t = x.size()
59
- if x_lengths is None:
60
- x_lengths = t
61
- ids_str_max = x_lengths - segment_size + 1
62
- ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
63
- ret = slice_segments(x, ids_str, segment_size)
64
- return ret, ids_str
65
 
66
 
67
  def get_timing_signal_1d(
68
- length, channels, min_timescale=1.0, max_timescale=1.0e4):
69
- position = torch.arange(length, dtype=torch.float)
70
- num_timescales = channels // 2
71
- log_timescale_increment = (
72
- math.log(float(max_timescale) / float(min_timescale)) /
73
- (num_timescales - 1))
74
- inv_timescales = min_timescale * torch.exp(
75
- torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
76
- scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
77
- signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
78
- signal = F.pad(signal, [0, 0, 0, channels % 2])
79
- signal = signal.view(1, channels, length)
80
- return signal
81
 
82
 
83
  def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
84
- b, channels, length = x.size()
85
- signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
86
- return x + signal.to(dtype=x.dtype, device=x.device)
87
 
88
 
89
  def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
90
- b, channels, length = x.size()
91
- signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
92
- return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
93
 
94
 
95
  def subsequent_mask(length):
96
- mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
97
- return mask
98
 
99
 
100
  @torch.jit.script
101
  def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
102
- n_channels_int = n_channels[0]
103
- in_act = input_a + input_b
104
- t_act = torch.tanh(in_act[:, :n_channels_int, :])
105
- s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
106
- acts = t_act * s_act
107
- return acts
108
 
109
 
110
  def convert_pad_shape(pad_shape):
111
- l = pad_shape[::-1]
112
- pad_shape = [item for sublist in l for item in sublist]
113
- return pad_shape
114
 
115
 
116
  def shift_1d(x):
117
- x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
118
- return x
119
 
120
 
121
  def sequence_mask(length, max_length=None):
122
- if max_length is None:
123
- max_length = length.max()
124
- x = torch.arange(max_length, dtype=length.dtype, device=length.device)
125
- return x.unsqueeze(0) < length.unsqueeze(1)
126
 
127
 
128
  def generate_path(duration, mask):
129
- """
130
- duration: [b, 1, t_x]
131
- mask: [b, 1, t_y, t_x]
132
- """
133
- device = duration.device
134
-
135
- b, _, t_y, t_x = mask.shape
136
- cum_duration = torch.cumsum(duration, -1)
137
-
138
- cum_duration_flat = cum_duration.view(b * t_x)
139
- path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
140
- path = path.view(b, t_x, t_y)
141
- path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
142
- path = path.unsqueeze(1).transpose(2,3) * mask
143
- return path
144
 
145
 
146
  def clip_grad_value_(parameters, clip_value, norm_type=2):
147
- if isinstance(parameters, torch.Tensor):
148
- parameters = [parameters]
149
- parameters = list(filter(lambda p: p.grad is not None, parameters))
150
- norm_type = float(norm_type)
151
- if clip_value is not None:
152
- clip_value = float(clip_value)
153
-
154
- total_norm = 0
155
- for p in parameters:
156
- param_norm = p.grad.data.norm(norm_type)
157
- total_norm += param_norm.item() ** norm_type
158
  if clip_value is not None:
159
- p.grad.data.clamp_(min=-clip_value, max=clip_value)
160
- total_norm = total_norm ** (1. / norm_type)
161
- return total_norm
1
  import math
2
+
3
  import torch
4
+ from torch.nn import functional as t_func
 
5
 
6
 
7
  def init_weights(m, mean=0.0, std=0.01):
8
+ classname = m.__class__.__name__
9
+ if classname.find("Conv") != -1:
10
+ m.weight.data.normal_(mean, std)
11
 
12
 
13
  def get_padding(kernel_size, dilation=1):
14
+ return int((kernel_size * dilation - dilation) / 2)
15
 
16
 
17
  def convert_pad_shape(pad_shape):
18
+ l = pad_shape[::-1]
19
+ pad_shape = [item for sublist in l for item in sublist]
20
+ return pad_shape
21
 
22
 
23
  def intersperse(lst, item):
24
+ result = [item] * (len(lst) * 2 + 1)
25
+ result[1::2] = lst
26
+ return result
27
 
28
 
29
  def kl_divergence(m_p, logs_p, m_q, logs_q):
30
+ """KL(P||Q)"""
31
+ kl = (logs_q - logs_p) - 0.5
32
+ kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2. * logs_q)
33
+ return kl
34
 
35
 
36
  def rand_gumbel(shape):
37
+ """Sample from the Gumbel distribution, protect from overflows."""
38
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
39
+ return -torch.log(-torch.log(uniform_samples))
40
 
41
 
42
  def rand_gumbel_like(x):
43
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
44
+ return g
45
 
46
 
47
  def slice_segments(x, ids_str, segment_size=4):
48
+ ret = torch.zeros_like(x[:, :, :segment_size])
49
+ for i in range(x.size(0)):
50
+ idx_str = ids_str[i]
51
+ idx_end = idx_str + segment_size
52
+ ret[i] = x[i, :, idx_str:idx_end]
53
+ return ret
54
 
55
 
56
  def rand_slice_segments(x, x_lengths=None, segment_size=4):
57
+ b, d, t = x.size()
58
+ if x_lengths is None:
59
+ x_lengths = t
60
+ ids_str_max = x_lengths - segment_size + 1
61
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
62
+ ret = slice_segments(x, ids_str, segment_size)
63
+ return ret, ids_str
64
 
65
 
66
  def get_timing_signal_1d(
67
+ length, channels, min_timescale=1.0, max_timescale=1.0e4):
68
+ position = torch.arange(length, dtype=torch.float)
69
+ num_timescales = channels // 2
70
+ log_timescale_increment = (
71
+ math.log(float(max_timescale) / float(min_timescale)) /
72
+ (num_timescales - 1))
73
+ inv_timescales = min_timescale * torch.exp(
74
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
75
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
76
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
77
+ signal = t_func.pad(signal, [0, 0, 0, channels % 2])
78
+ signal = signal.view(1, channels, length)
79
+ return signal
80
 
81
 
82
  def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
83
+ b, channels, length = x.size()
84
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
85
+ return x + signal.to(dtype=x.dtype, device=x.device)
86
 
87
 
88
  def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
89
+ b, channels, length = x.size()
90
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
91
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
92
 
93
 
94
  def subsequent_mask(length):
95
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
96
+ return mask
97
 
98
 
99
  @torch.jit.script
100
  def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
101
+ n_channels_int = n_channels[0]
102
+ in_act = input_a + input_b
103
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
104
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
105
+ acts = t_act * s_act
106
+ return acts
107
 
108
 
109
  def convert_pad_shape(pad_shape):
110
+ l = pad_shape[::-1]
111
+ pad_shape = [item for sublist in l for item in sublist]
112
+ return pad_shape
113
 
114
 
115
  def shift_1d(x):
116
+ x = t_func.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
117
+ return x
118
 
119
 
120
  def sequence_mask(length, max_length=None):
121
+ if max_length is None:
122
+ max_length = length.max()
123
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
124
+ return x.unsqueeze(0) < length.unsqueeze(1)
125
 
126
 
127
  def generate_path(duration, mask):
128
+ """
129
+ duration: [b, 1, t_x]
130
+ mask: [b, 1, t_y, t_x]
131
+ """
132
+ device = duration.device
133
+
134
+ b, _, t_y, t_x = mask.shape
135
+ cum_duration = torch.cumsum(duration, -1)
136
+
137
+ cum_duration_flat = cum_duration.view(b * t_x)
138
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
139
+ path = path.view(b, t_x, t_y)
140
+ path = path - t_func.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
141
+ path = path.unsqueeze(1).transpose(2, 3) * mask
142
+ return path
143
 
144
 
145
  def clip_grad_value_(parameters, clip_value, norm_type=2):
146
+ if isinstance(parameters, torch.Tensor):
147
+ parameters = [parameters]
148
+ parameters = list(filter(lambda para: para.grad is not None, parameters))
149
+ norm_type = float(norm_type)
150
  if clip_value is not None:
151
+ clip_value = float(clip_value)
152
+
153
+ total_norm = 0
154
+ for p in parameters:
155
+ param_norm = p.grad.data.norm(norm_type)
156
+ total_norm += param_norm.item() ** norm_type
157
+ if clip_value is not None:
158
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
159
+ total_norm = total_norm ** (1. / norm_type)
160
+ return total_norm
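commons.py gets the same cosmetic treatment (unused imports dropped, F renamed to t_func); the helpers themselves behave as before. Two of them appear throughout the codebase, and a short illustration on toy tensors may help:

```python
# Illustration of two helpers from commons.py: sequence_mask() builds a boolean
# padding mask from per-example lengths, and convert_pad_shape() flattens a nested
# pad spec into the argument order torch.nn.functional.pad expects.
import torch
from torch.nn import functional as t_func
import commons

lengths = torch.tensor([4, 2])
mask = commons.sequence_mask(lengths, max_length=5)
print(mask)
# tensor([[ True,  True,  True,  True, False],
#         [ True,  True, False, False, False]])

# Pad the last dimension of a [b, c, t] tensor by one frame on the left, as shift_1d() does.
x = torch.arange(6.).view(1, 2, 3)
pad = commons.convert_pad_shape([[0, 0], [0, 0], [1, 0]])  # -> [1, 0, 0, 0, 0, 0]
print(t_func.pad(x, pad)[:, :, :-1])
```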
configs/yilanqiu.json → config.json RENAMED
@@ -1,7 +1,7 @@
  {
  "train": {
  "log_interval": 200,
- "eval_interval": 2000,
  "seed": 1234,
  "epochs": 10000,
  "learning_rate": 2e-4,
@@ -10,7 +10,7 @@
  0.99
  ],
  "eps": 1e-9,
- "batch_size": 16,
  "fp16_run": true,
  "lr_decay": 0.999875,
  "segment_size": 8192,
@@ -20,8 +20,8 @@
  "c_kl": 1.0
  },
  "data": {
- "training_files": "/root/content/qiu/train.txt",
- "validation_files": "/root/content/qiu/val.txt",
  "text_cleaners": [
  "english_cleaners2"
  ],
@@ -34,10 +34,10 @@
  "mel_fmin": 0.0,
  "mel_fmax": null,
  "add_blank": true,
- "n_speakers": 3,
- "cleaned_text": true
  },
  "model": {
  "inter_channels": 192,
  "hidden_channels": 256,
  "filter_channels": 768,
@@ -86,8 +86,13 @@
  "gin_channels": 256
  },
  "speakers": [
- "maolei",
- "opencpop",
- "yilanqiu"
  ]
- }

  {
  "train": {
  "log_interval": 200,
+ "eval_interval": 5000,
  "seed": 1234,
  "epochs": 10000,
  "learning_rate": 2e-4,

  0.99
  ],
  "eps": 1e-9,
+ "batch_size": 32,
  "fp16_run": true,
  "lr_decay": 0.999875,
  "segment_size": 8192,

  "c_kl": 1.0
  },
  "data": {
+ "training_files": "./filelist/train.txt",
+ "validation_files": "./filelist/val.txt",
  "text_cleaners": [
  "english_cleaners2"
  ],

  "mel_fmin": 0.0,
  "mel_fmax": null,
  "add_blank": true,
+ "n_speakers": 8
  },
  "model": {
+ "sampling_rate": 22050,
  "inter_channels": 192,
  "hidden_channels": 256,
  "filter_channels": 768,

  "gin_channels": 256
  },
  "speakers": [
+ "zhezhi",
+ "kuangsan",
+ "sisinai",
+ "qinli",
+ "xixian",
+ "yejushi",
+ "meijiu",
+ "shixiang"
  ]
+ }
configs/nyarumul.json DELETED
@@ -1,53 +0,0 @@
- {
- "train": {
- "log_interval": 200,
- "eval_interval": 2000,
- "seed": 1234,
- "epochs": 10000,
- "learning_rate": 2e-4,
- "betas": [0.8, 0.99],
- "eps": 1e-9,
- "batch_size": 16,
- "fp16_run": true,
- "lr_decay": 0.999875,
- "segment_size": 8192,
- "init_lr_ratio": 1,
- "warmup_epochs": 0,
- "c_mel": 45,
- "c_kl": 1.0
- },
- "data": {
- "training_files":"/content/drive/MyDrive/SingingVC/trainmul.txt",
- "validation_files":"/content/drive/MyDrive/SingingVC/valmul.txt",
- "text_cleaners":["english_cleaners2"],
- "max_wav_value": 32768.0,
- "sampling_rate": 22050,
- "filter_length": 1024,
- "hop_length": 256,
- "win_length": 1024,
- "n_mel_channels": 80,
- "mel_fmin": 0.0,
- "mel_fmax": null,
- "add_blank": true,
- "n_speakers": 3,
- "cleaned_text": true
- },
- "model": {
- "inter_channels": 192,
- "hidden_channels": 256,
- "filter_channels": 768,
- "n_heads": 2,
- "n_layers": 6,
- "kernel_size": 3,
- "p_dropout": 0.1,
- "resblock": "1",
- "resblock_kernel_sizes": [3,7,11],
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
- "upsample_rates": [8,8,2,2],
- "upsample_initial_channel": 512,
- "upsample_kernel_sizes": [16,16,4,4],
- "n_layers_q": 3,
- "use_spectral_norm": false,
- "gin_channels": 256
- }
- }
configs/nyarusing.json DELETED
@@ -1,52 +0,0 @@
- {
- "train": {
- "log_interval": 200,
- "eval_interval": 2000,
- "seed": 1234,
- "epochs": 20000,
- "learning_rate": 2e-4,
- "betas": [0.8, 0.99],
- "eps": 1e-9,
- "batch_size": 24,
- "fp16_run": true,
- "lr_decay": 0.999875,
- "segment_size": 8192,
- "init_lr_ratio": 1,
- "warmup_epochs": 0,
- "c_mel": 45,
- "c_kl": 1.0
- },
- "data": {
- "training_files":"/content/train.txt",
- "validation_files":"/content/nyarusing/val.txt",
- "text_cleaners":["english_cleaners2"],
- "max_wav_value": 32768.0,
- "sampling_rate": 22050,
- "filter_length": 1024,
- "hop_length": 256,
- "win_length": 1024,
- "n_mel_channels": 80,
- "mel_fmin": 0.0,
- "mel_fmax": null,
- "add_blank": true,
- "n_speakers": 0,
- "cleaned_text": true
- },
- "model": {
- "inter_channels": 192,
- "hidden_channels": 256,
- "filter_channels": 768,
- "n_heads": 2,
- "n_layers": 6,
- "kernel_size": 3,
- "p_dropout": 0.1,
- "resblock": "1",
- "resblock_kernel_sizes": [3,7,11],
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
- "upsample_rates": [8,8,2,2],
- "upsample_initial_channel": 512,
- "upsample_kernel_sizes": [16,16,4,4],
- "n_layers_q": 3,
- "use_spectral_norm": false
- }
- }
data.py DELETED
@@ -1,36 +0,0 @@
- import os
- import numpy as np
- import icassp2022_vocal_transcription
-
-
- def resize2d(source, target_len):
- source = source.astype(float)
- source[source < 0.001] = np.nan
- target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
- source)
- res = np.nan_to_num(target)
- ret = res[:].astype(int)
- # 若调整大小时采样到中间的点,则以上一个点作为当前音高值
- for i in range(len(res)):
- if res[i] - ret[i] > 0.001:
- ret[i] = ret[i - 1]
- return ret
-
-
- def get_end_file(dir_path, end):
- file_lists = []
- for root, dirs, files in os.walk(dir_path):
- for f_file in files:
- if f_file.endswith(end):
- file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
-
- return file_lists
-
-
- folder = "val"
- wav_paths = get_end_file(f"./qiu/wavs/{folder}/", "wav")
- for wav_path in wav_paths:
- pitch = icassp2022_vocal_transcription.transcribe(wav_path)
- soft = np.load(wav_path.replace("wavs", "soft").replace(".wav", ".npy"))
- pitch = resize2d(pitch, len(soft[:, 0]))
- np.save(wav_path.replace("wavs", "pitch").replace(".wav", ".npy"), pitch)
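The deleted data.py was an offline preprocessing script: it transcribed each wav to a pitch track and stretched it to the length of the matching soft-unit (HuBERT) feature matrix with resize2d(). A self-contained rerun of that function on made-up numbers shows the behaviour being removed here:

```python
# Behaviour of the removed resize2d() helper from data.py above: it stretches a
# frame-level pitch track to a new length with np.interp, treats near-zero
# (unvoiced) frames as gaps, and snaps interpolated midpoints back to the previous
# pitch value. The input values below are toy numbers for illustration only.
import numpy as np

def resize2d(source, target_len):
    source = source.astype(float)
    source[source < 0.001] = np.nan
    target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len,
                       np.arange(0, len(source)), source)
    res = np.nan_to_num(target)
    ret = res[:].astype(int)
    # If resizing lands between two frames, keep the previous frame's pitch value.
    for i in range(len(res)):
        if res[i] - ret[i] > 0.001:
            ret[i] = ret[i - 1]
    return ret

pitch = np.array([0, 0, 60, 60, 62, 62, 0, 0])   # MIDI-style pitch, 0 = unvoiced
print(resize2d(pitch, 12))                        # same contour resampled to 12 frames
```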
data_utils.py CHANGED
@@ -1,14 +1,12 @@
1
- import time
2
  import os
3
  import random
 
4
  import numpy as np
5
  import torch
6
  import torch.utils.data
7
- import numpy as np
8
- import commons
9
  from mel_processing import spectrogram_torch
 
10
  from utils import load_wav_to_torch, load_filepaths_and_text
11
- from text import text_to_sequence, cleaned_text_to_sequence
12
 
13
 
14
  def dropout1d(myarray, ratio=0.5):
@@ -59,11 +57,11 @@ class TextAudioLoader(torch.utils.data.Dataset):
59
 
60
  def get_audio_text_pair(self, audiopath_and_text):
61
  # separate filename and text
62
- audiopath, text, pitch = audiopath_and_text[0], audiopath_and_text[1],audiopath_and_text[2]
63
  text = self.get_text(text)
64
  spec, wav = self.get_audio(audiopath)
65
  pitch = self.get_pitch(pitch)
66
- return (text, spec, wav, pitch)
67
 
68
  def get_pitch(self, pitch):
69
 
@@ -99,7 +97,7 @@ class TextAudioLoader(torch.utils.data.Dataset):
99
  return len(self.audiopaths_and_text)
100
 
101
 
102
- class TextAudioCollate():
103
  """ Zero-pads model inputs and targets
104
  """
105
 
@@ -123,7 +121,6 @@ class TextAudioCollate():
123
  max_pitch_len = max([x[3].shape[0] for x in batch])
124
  # print(batch)
125
 
126
-
127
  text_lengths = torch.LongTensor(len(batch))
128
  spec_lengths = torch.LongTensor(len(batch))
129
  wav_lengths = torch.LongTensor(len(batch))
@@ -205,13 +202,14 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
205
 
206
  def get_audio_text_speaker_pair(self, audiopath_sid_text):
207
  # separate filename, speaker_id and text
208
- audiopath, sid, text, pitch = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2], audiopath_sid_text[3]
 
209
  text = self.get_text(text)
210
  spec, wav = self.get_audio(audiopath)
211
  sid = self.get_sid(sid)
212
  pitch = self.get_pitch(pitch)
213
 
214
- return (text, spec, wav, pitch, sid)
215
 
216
  def get_audio(self, filename):
217
  audio, sampling_rate = load_wav_to_torch(filename)
@@ -235,7 +233,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
235
  soft = np.load(text)
236
  text_norm = torch.FloatTensor(soft)
237
  return text_norm
238
-
239
  def get_pitch(self, pitch):
240
  return torch.LongTensor(np.load(pitch))
241
 
@@ -250,7 +248,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
250
  return len(self.audiopaths_sid_text)
251
 
252
 
253
- class TextAudioSpeakerCollate():
254
  """ Zero-pads model inputs and targets
255
  """
256
 
@@ -310,7 +308,7 @@ class TextAudioSpeakerCollate():
310
 
311
  if self.return_ids:
312
  return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid, ids_sorted_decreasing
313
- return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths,pitch_padded , sid
314
 
315
 
316
  class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
@@ -400,7 +398,7 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
400
 
401
  if hi > lo:
402
  mid = (hi + lo) // 2
403
- if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
404
  return mid
405
  elif x <= self.boundaries[mid]:
406
  return self._bisect(x, lo, mid)
 
 
1
  import os
2
  import random
3
+
4
  import numpy as np
5
  import torch
6
  import torch.utils.data
 
 
7
  from mel_processing import spectrogram_torch
8
+
9
  from utils import load_wav_to_torch, load_filepaths_and_text
 
10
 
11
 
12
  def dropout1d(myarray, ratio=0.5):
 
57
 
58
  def get_audio_text_pair(self, audiopath_and_text):
59
  # separate filename and text
60
+ audiopath, text, pitch = audiopath_and_text[0], audiopath_and_text[1], audiopath_and_text[2]
61
  text = self.get_text(text)
62
  spec, wav = self.get_audio(audiopath)
63
  pitch = self.get_pitch(pitch)
64
+ return text, spec, wav, pitch
65
 
66
  def get_pitch(self, pitch):
67
 
 
97
  return len(self.audiopaths_and_text)
98
 
99
 
100
+ class TextAudioCollate:
101
  """ Zero-pads model inputs and targets
102
  """
103
 
 
121
  max_pitch_len = max([x[3].shape[0] for x in batch])
122
  # print(batch)
123
 
 
124
  text_lengths = torch.LongTensor(len(batch))
125
  spec_lengths = torch.LongTensor(len(batch))
126
  wav_lengths = torch.LongTensor(len(batch))
 
202
 
203
  def get_audio_text_speaker_pair(self, audiopath_sid_text):
204
  # separate filename, speaker_id and text
205
+ audiopath, sid, text, pitch = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2], \
206
+ audiopath_sid_text[3]
207
  text = self.get_text(text)
208
  spec, wav = self.get_audio(audiopath)
209
  sid = self.get_sid(sid)
210
  pitch = self.get_pitch(pitch)
211
 
212
+ return text, spec, wav, pitch, sid
213
 
214
  def get_audio(self, filename):
215
  audio, sampling_rate = load_wav_to_torch(filename)
 
233
  soft = np.load(text)
234
  text_norm = torch.FloatTensor(soft)
235
  return text_norm
236
+
237
  def get_pitch(self, pitch):
238
  return torch.LongTensor(np.load(pitch))
239
 
 
248
  return len(self.audiopaths_sid_text)
249
 
250
 
251
+ class TextAudioSpeakerCollate:
252
  """ Zero-pads model inputs and targets
253
  """
254
 
 
308
 
309
  if self.return_ids:
310
  return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid, ids_sorted_decreasing
311
+ return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid
312
 
313
 
314
  class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
 
398
 
399
  if hi > lo:
400
  mid = (hi + lo) // 2
401
+ if self.boundaries[mid] < x <= self.boundaries[mid + 1]:
402
  return mid
403
  elif x <= self.boundaries[mid]:
404
  return self._bisect(x, lo, mid)
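The only behavioural line in the _bisect hunk above is the half-open bucket test boundaries[mid] < x <= boundaries[mid + 1]; the chained comparison is equivalent to the old `and` form. A small standalone sketch of how the bucket sampler assigns a spectrogram length to a bucket (the boundary values here are made up):

```
def bisect(x, boundaries, lo=0, hi=None):
    hi = len(boundaries) - 1 if hi is None else hi
    if hi > lo:
        mid = (hi + lo) // 2
        if boundaries[mid] < x <= boundaries[mid + 1]:
            return mid
        elif x <= boundaries[mid]:
            return bisect(x, boundaries, lo, mid)
        return bisect(x, boundaries, mid + 1, hi)
    return -1   # outside every bucket; such samples are dropped by the sampler

boundaries = [32, 300, 400, 500, 600, 700, 800, 900, 1000]  # hypothetical frame-length boundaries
print(bisect(450, boundaries))   # -> 2, i.e. the (400, 500] bucket
```
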
hubert/__init__.py DELETED
@@ -1,8 +0,0 @@
1
- from .model import (
2
- Hubert,
3
- HubertDiscrete,
4
- HubertSoft,
5
- hubert_discrete,
6
- hubert_soft,
7
- kmeans100,
8
- )
 
 
 
 
 
 
 
 
 
hubert/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (281 Bytes)
 
hubert/__pycache__/model.cpython-38.pyc DELETED
Binary file (10 kB)
 
hubert/dataset.py DELETED
@@ -1,91 +0,0 @@
1
- import random
2
- from pathlib import Path
3
- import numpy as np
4
- import json
5
-
6
- import torch
7
- import torch.nn.functional as F
8
- from torch.utils.data import Dataset
9
- import torchaudio
10
-
11
-
12
- class AcousticUnitsDataset(Dataset):
13
- def __init__(
14
- self,
15
- root: Path,
16
- sample_rate: int = 16000,
17
- label_rate: int = 50,
18
- min_samples: int = 32000,
19
- max_samples: int = 250000,
20
- train: bool = True,
21
- ):
22
- self.wavs_dir = root / "wavs"
23
- self.units_dir = root / "units"
24
-
25
- with open(root / "lengths.json") as file:
26
- self.lenghts = json.load(file)
27
-
28
- pattern = "train-*/**/*.flac" if train else "dev-*/**/*.flac"
29
- metadata = (
30
- (path, path.relative_to(self.wavs_dir).with_suffix("").as_posix())
31
- for path in self.wavs_dir.rglob(pattern)
32
- )
33
- metadata = ((path, key) for path, key in metadata if key in self.lenghts)
34
- self.metadata = [
35
- path for path, key in metadata if self.lenghts[key] > min_samples
36
- ]
37
-
38
- self.sample_rate = sample_rate
39
- self.label_rate = label_rate
40
- self.min_samples = min_samples
41
- self.max_samples = max_samples
42
- self.train = train
43
-
44
- def __len__(self):
45
- return len(self.metadata)
46
-
47
- def __getitem__(self, index):
48
- wav_path = self.metadata[index]
49
- units_path = self.units_dir / wav_path.relative_to(self.wavs_dir)
50
-
51
- wav, _ = torchaudio.load(wav_path)
52
- wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
53
- codes = np.load(units_path.with_suffix(".npy"))
54
-
55
- return wav, torch.from_numpy(codes).long()
56
-
57
- def collate(self, batch):
58
- wavs, codes = zip(*batch)
59
- wavs, codes = list(wavs), list(codes)
60
-
61
- wav_lengths = [wav.size(-1) for wav in wavs]
62
- code_lengths = [code.size(-1) for code in codes]
63
-
64
- wav_frames = min(self.max_samples, *wav_lengths)
65
-
66
- collated_wavs, wav_offsets = [], []
67
- for wav in wavs:
68
- wav_diff = wav.size(-1) - wav_frames
69
- wav_offset = random.randint(0, wav_diff)
70
- wav = wav[:, wav_offset : wav_offset + wav_frames]
71
-
72
- collated_wavs.append(wav)
73
- wav_offsets.append(wav_offset)
74
-
75
- rate = self.label_rate / self.sample_rate
76
- code_offsets = [round(wav_offset * rate) for wav_offset in wav_offsets]
77
- code_frames = round(wav_frames * rate)
78
- remaining_code_frames = [
79
- length - offset for length, offset in zip(code_lengths, code_offsets)
80
- ]
81
- code_frames = min(code_frames, *remaining_code_frames)
82
-
83
- collated_codes = []
84
- for code, code_offset in zip(codes, code_offsets):
85
- code = code[code_offset : code_offset + code_frames]
86
- collated_codes.append(code)
87
-
88
- wavs = torch.stack(collated_wavs, dim=0)
89
- codes = torch.stack(collated_codes, dim=0)
90
-
91
- return wavs, codes
 
hubert/utils.py DELETED
@@ -1,58 +0,0 @@
1
- import torch
2
-
3
-
4
- class Metric:
5
- def __init__(self):
6
- self.steps = 0
7
- self.value = 0
8
-
9
- def update(self, value):
10
- self.steps += 1
11
- self.value += (value - self.value) / self.steps
12
- return self.value
13
-
14
- def reset(self):
15
- self.steps = 0
16
- self.value = 0
17
-
18
-
19
- def save_checkpoint(
20
- checkpoint_dir,
21
- hubert,
22
- optimizer,
23
- scaler,
24
- step,
25
- loss,
26
- best,
27
- logger,
28
- ):
29
- state = {
30
- "hubert": hubert.state_dict(),
31
- "optimizer": optimizer.state_dict(),
32
- "scaler": scaler.state_dict(),
33
- "step": step,
34
- "loss": loss,
35
- }
36
- checkpoint_dir.mkdir(exist_ok=True, parents=True)
37
- checkpoint_path = checkpoint_dir / f"model-{step}.pt"
38
- torch.save(state, checkpoint_path)
39
- if best:
40
- best_path = checkpoint_dir / "model-best.pt"
41
- torch.save(state, best_path)
42
- logger.info(f"Saved checkpoint: {checkpoint_path.stem}")
43
-
44
-
45
- def load_checkpoint(
46
- load_path,
47
- hubert,
48
- optimizer,
49
- scaler,
50
- rank,
51
- logger,
52
- ):
53
- logger.info(f"Loading checkpoint from {load_path}")
54
- checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"})
55
- hubert.load_state_dict(checkpoint["hubert"])
56
- scaler.load_state_dict(checkpoint["scaler"])
57
- optimizer.load_state_dict(checkpoint["optimizer"])
58
- return checkpoint["step"], checkpoint["loss"]
 
hubert/model.py → hubert_model.py RENAMED
@@ -1,20 +1,12 @@
1
  import copy
2
- from typing import Optional, Tuple
3
  import random
4
-
5
- from sklearn.cluster import KMeans
6
 
7
  import torch
8
  import torch.nn as nn
9
- import torch.nn.functional as F
10
  from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
11
 
12
- URLS = {
13
- "hubert-discrete": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-discrete-e9416457.pt",
14
- "hubert-soft": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt",
15
- "kmeans100": "https://github.com/bshall/hubert/releases/download/v0.1/kmeans100-50f36a95.pt",
16
- }
17
-
18
 
19
  class Hubert(nn.Module):
20
  def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
@@ -44,7 +36,7 @@ class Hubert(nn.Module):
44
  return x, mask
45
 
46
  def encode(
47
- self, x: torch.Tensor, layer: Optional[int] = None
48
  ) -> Tuple[torch.Tensor, torch.Tensor]:
49
  x = self.feature_extractor(x)
50
  x = self.feature_projection(x.transpose(1, 2))
@@ -75,24 +67,11 @@ class HubertSoft(Hubert):
75
 
76
  @torch.inference_mode()
77
  def units(self, wav: torch.Tensor) -> torch.Tensor:
78
- wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
79
  x, _ = self.encode(wav)
80
  return self.proj(x)
81
 
82
 
83
- class HubertDiscrete(Hubert):
84
- def __init__(self, kmeans):
85
- super().__init__(504)
86
- self.kmeans = kmeans
87
-
88
- @torch.inference_mode()
89
- def units(self, wav: torch.Tensor) -> torch.LongTensor:
90
- wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
91
- x, _ = self.encode(wav, layer=7)
92
- x = self.kmeans.predict(x.squeeze().cpu().numpy())
93
- return torch.tensor(x, dtype=torch.long, device=wav.device)
94
-
95
-
96
  class FeatureExtractor(nn.Module):
97
  def __init__(self):
98
  super().__init__()
@@ -106,13 +85,13 @@ class FeatureExtractor(nn.Module):
106
  self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
107
 
108
  def forward(self, x: torch.Tensor) -> torch.Tensor:
109
- x = F.gelu(self.norm0(self.conv0(x)))
110
- x = F.gelu(self.conv1(x))
111
- x = F.gelu(self.conv2(x))
112
- x = F.gelu(self.conv3(x))
113
- x = F.gelu(self.conv4(x))
114
- x = F.gelu(self.conv5(x))
115
- x = F.gelu(self.conv6(x))
116
  return x
117
 
118
 
@@ -144,13 +123,13 @@ class PositionalConvEmbedding(nn.Module):
144
 
145
  def forward(self, x: torch.Tensor) -> torch.Tensor:
146
  x = self.conv(x.transpose(1, 2))
147
- x = F.gelu(x[:, :, :-1])
148
  return x.transpose(1, 2)
149
 
150
 
151
  class TransformerEncoder(nn.Module):
152
  def __init__(
153
- self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
154
  ) -> None:
155
  super(TransformerEncoder, self).__init__()
156
  self.layers = nn.ModuleList(
@@ -159,11 +138,11 @@ class TransformerEncoder(nn.Module):
159
  self.num_layers = num_layers
160
 
161
  def forward(
162
- self,
163
- src: torch.Tensor,
164
- mask: torch.Tensor = None,
165
- src_key_padding_mask: torch.Tensor = None,
166
- output_layer: Optional[int] = None,
167
  ) -> torch.Tensor:
168
  output = src
169
  for layer in self.layers[:output_layer]:
@@ -174,11 +153,11 @@ class TransformerEncoder(nn.Module):
174
 
175
 
176
  def _compute_mask(
177
- shape: Tuple[int, int],
178
- mask_prob: float,
179
- mask_length: int,
180
- device: torch.device,
181
- min_masks: int = 0,
182
  ) -> torch.Tensor:
183
  batch_size, sequence_length = shape
184
 
@@ -228,62 +207,17 @@ def _compute_mask(
228
  return mask
229
 
230
 
231
- def hubert_discrete(
232
- pretrained: bool = True,
233
- progress: bool = True,
234
- ) -> HubertDiscrete:
235
- r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
236
- Args:
237
- pretrained (bool): load pretrained weights into the model
238
- progress (bool): show progress bar when downloading model
239
- """
240
- kmeans = kmeans100(pretrained=pretrained, progress=progress)
241
- hubert = HubertDiscrete(kmeans)
242
- if pretrained:
243
- checkpoint = torch.hub.load_state_dict_from_url(
244
- URLS["hubert-discrete"], progress=progress
245
- )
246
- consume_prefix_in_state_dict_if_present(checkpoint, "module.")
247
- hubert.load_state_dict(checkpoint)
248
- hubert.eval()
249
- return hubert
250
-
251
-
252
  def hubert_soft(
253
- path: str
254
  ) -> HubertSoft:
255
  r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
256
  Args:
257
  path (str): path of a pretrained model
258
  """
259
- dev = torch.device("cpu" if torch.cuda.is_available() else "cpu")
260
  hubert = HubertSoft()
261
  checkpoint = torch.load(path)
262
  consume_prefix_in_state_dict_if_present(checkpoint, "module.")
263
  hubert.load_state_dict(checkpoint)
264
  hubert.eval().to(dev)
265
  return hubert
266
-
267
-
268
- def _kmeans(
269
- num_clusters: int, pretrained: bool = True, progress: bool = True
270
- ) -> KMeans:
271
- kmeans = KMeans(num_clusters)
272
- if pretrained:
273
- checkpoint = torch.hub.load_state_dict_from_url(
274
- URLS[f"kmeans{num_clusters}"], progress=progress
275
- )
276
- kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
277
- kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"]
278
- kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy()
279
- return kmeans
280
-
281
-
282
- def kmeans100(pretrained: bool = True, progress: bool = True) -> KMeans:
283
- r"""
284
- k-means checkpoint for HuBERT-Discrete with 100 clusters.
285
- Args:
286
- pretrained (bool): load pretrained weights into the model
287
- progress (bool): show progress bar when downloading model
288
- """
289
- return _kmeans(100, pretrained, progress)
 
1
  import copy
 
2
  import random
3
+ from typing import Optional, Tuple
 
4
 
5
  import torch
6
  import torch.nn as nn
7
+ import torch.nn.functional as t_func
8
  from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
9
 
 
 
 
 
 
 
10
 
11
  class Hubert(nn.Module):
12
  def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
 
36
  return x, mask
37
 
38
  def encode(
39
+ self, x: torch.Tensor, layer: Optional[int] = None
40
  ) -> Tuple[torch.Tensor, torch.Tensor]:
41
  x = self.feature_extractor(x)
42
  x = self.feature_projection(x.transpose(1, 2))
 
67
 
68
  @torch.inference_mode()
69
  def units(self, wav: torch.Tensor) -> torch.Tensor:
70
+ wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
71
  x, _ = self.encode(wav)
72
  return self.proj(x)
73
 
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  class FeatureExtractor(nn.Module):
76
  def __init__(self):
77
  super().__init__()
 
85
  self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
86
 
87
  def forward(self, x: torch.Tensor) -> torch.Tensor:
88
+ x = t_func.gelu(self.norm0(self.conv0(x)))
89
+ x = t_func.gelu(self.conv1(x))
90
+ x = t_func.gelu(self.conv2(x))
91
+ x = t_func.gelu(self.conv3(x))
92
+ x = t_func.gelu(self.conv4(x))
93
+ x = t_func.gelu(self.conv5(x))
94
+ x = t_func.gelu(self.conv6(x))
95
  return x
96
 
97
 
 
123
 
124
  def forward(self, x: torch.Tensor) -> torch.Tensor:
125
  x = self.conv(x.transpose(1, 2))
126
+ x = t_func.gelu(x[:, :, :-1])
127
  return x.transpose(1, 2)
128
 
129
 
130
  class TransformerEncoder(nn.Module):
131
  def __init__(
132
+ self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
133
  ) -> None:
134
  super(TransformerEncoder, self).__init__()
135
  self.layers = nn.ModuleList(
 
138
  self.num_layers = num_layers
139
 
140
  def forward(
141
+ self,
142
+ src: torch.Tensor,
143
+ mask: torch.Tensor = None,
144
+ src_key_padding_mask: torch.Tensor = None,
145
+ output_layer: Optional[int] = None,
146
  ) -> torch.Tensor:
147
  output = src
148
  for layer in self.layers[:output_layer]:
 
153
 
154
 
155
  def _compute_mask(
156
+ shape: Tuple[int, int],
157
+ mask_prob: float,
158
+ mask_length: int,
159
+ device: torch.device,
160
+ min_masks: int = 0,
161
  ) -> torch.Tensor:
162
  batch_size, sequence_length = shape
163
 
 
207
  return mask
208
 
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  def hubert_soft(
211
+ path: str
212
  ) -> HubertSoft:
213
  r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
214
  Args:
215
  path (str): path of a pretrained model
216
  """
217
+ dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
218
  hubert = HubertSoft()
219
  checkpoint = torch.load(path)
220
  consume_prefix_in_state_dict_if_present(checkpoint, "module.")
221
  hubert.load_state_dict(checkpoint)
222
  hubert.eval().to(dev)
223
  return hubert
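With the k-means/discrete path stripped out, the renamed hubert_model module reduces to one loader and one method. A hedged usage sketch; the checkpoint and audio file names below are placeholders:

```
import torch
import torchaudio

import hubert_model

hubert = hubert_model.hubert_soft("hubert-soft.pt")       # any local hubert-soft checkpoint

wav, sr = torchaudio.load("example.wav")                  # mono input assumed here
wav = torchaudio.functional.resample(wav, sr, 16000)      # the encoder expects 16 kHz
with torch.inference_mode():
    units = hubert.units(wav.unsqueeze(0))                # (batch, frames, unit_dim) soft units
print(units.shape)
```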
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
icassp2022_vocal_transcription/.gitignore DELETED
@@ -1,3 +0,0 @@
1
- output/
2
- audio/*
3
- !audio/test.wav
 
 
 
 
icassp2022_vocal_transcription/README.md DELETED
@@ -1,56 +0,0 @@
1
- # icassp2022-vocal-transcription
2
- Companion code for the paper:
3
- Sangeun Kum, Jongpil Lee, Keunhyoung Luke Kim, Taehyoung Kim, Juhan Nam *"Pseudo-Label Transfer from Frame-level to Note-level in a Teacher-student Framework for Singing Transcription from Polyphonic Music"*, ICASSP2022, Singapore <[link](https://ieeexplore.ieee.org/document/9747147)>
4
-
5
-
6
- ## Abstract
7
-
8
- Lack of large-scale note-level labeled data is the major obstacle to singing transcription from polyphonic music. We address the issue by using pseudo labels from vocal pitch estimation models given unlabeled data. The proposed method first converts the frame-level pseudo labels to note-level through pitch and rhythm quantization steps. Then, it further improves the label quality through self-training in a teacher-student framework.
9
-
10
- <img src="./img/ICASSP2022-fig1-2.png" width="70%">
11
-
12
- To validate the method, we conduct various experiment settings by investigating two vocal pitch estimation models as pseudo-label generators, two setups of teacher-student frameworks, and the number of iterations in self-training. The results show that the proposed method can effectively leverage large-scale unlabeled audio data and self-training with the noisy student model helps to improve performance. Finally, we show that the model trained with only unlabeled data has comparable performance to previous works and the model trained with additional labeled data achieves higher accuracy than the model trained with only labeled data.
13
-
14
- ## Demo video
15
- - <[Youtube Link 1](https://www.youtube.com/watch?v=wlD-GAGuj0M "Demo 1: Singing transcription from polyphonic music")> You&I (IU)
16
- - <[Youtube Link 2](https://youtu.be/iitOC4vuC8U "Demo 2: Singing transcription from polyphonic music")> You in my arms (Myung jin Moon)
17
-
18
-
19
- ## Dependencies
20
-
21
- - OS: LINUX
22
- - Programming language: Python 3.6+
23
- - Python Library
24
- - Keras 2.7.0 (Deep Learning library)
25
- - tensorflow 2.5.0 (Deep Learning library)
26
- - Librosa 0.8.1 (for STFT)
27
- - pydub 0.25.1 (for loading audio and resampling)
28
- - pretty-midi (for handling midi data)
29
- - Numpy, SciPy
30
-
31
- - Hardware
32
- - 1 GPU : GeForce GTX 3090
33
-
34
-
35
- ## Using STP from the command line
36
- ```
37
- $ python singing_transcription.py -i ../audio/test.wav -o ../output
38
-
39
- [optional arguments]
40
- -i path_audio Path to input audio file. (default: '../audio/pop1.wav')
41
- -o pathsave Path to folder for saving .mid file (default: '../output')
42
- -ot output_type (optional) Output type: midi or frame-level pitch score(fps) (default: 'midi')
43
- ```
44
- - output example: ADC04-pop1.wav
45
- <img src="./img/example_pop1_midi.png" width="100%">
46
- # Citation
47
- If you find our work useful, please consider citing our paper.
48
-
49
- ```
50
- @inproceedings{kum2022pseudo,
51
- title={Pseudo-Label Transfer from Frame-Level to Note-Level in a Teacher-Student Framework for Singing Transcription from Polyphonic Music},
52
- author={Sangeun Kum, Jongpil Lee, Keunhyoung Luke Kim, Taehyoung Kim, and Juhan Nam},
53
- booktitle={Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
54
- year={2022}
55
- }
56
- ```
 
icassp2022_vocal_transcription/__init__.py DELETED
@@ -1,3 +0,0 @@
1
- from .src import singing_transcription
2
-
3
- transcribe = singing_transcription.get_frame_level_output
 
 
 
 
icassp2022_vocal_transcription/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (254 Bytes)
 
icassp2022_vocal_transcription/data/weight_ST.hdf5 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ba38c046af48a359575c1a312d931966e56d94013ad56dd91f2de5219afa8a4
3
- size 17535208
 
 
 
 
icassp2022_vocal_transcription/data/x_train_mean.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f977a72104d19c3b92c764a4fe1335f411ffc331bb6f81ec2420016f07fa772c
3
- size 4232
 
 
 
 
icassp2022_vocal_transcription/data/x_train_std.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a120cbf8bc8e62544f7b0ce1185b0244f3c6971fd50b3092c66a0fda1f5405a
3
- size 4232
 
 
 
 
icassp2022_vocal_transcription/img/ICASSP2022-fig1-2.png DELETED
Binary file (26.9 kB)
 
icassp2022_vocal_transcription/img/example_pop1_midi.png DELETED
Binary file (136 kB)
 
icassp2022_vocal_transcription/requirements.txt DELETED
@@ -1,8 +0,0 @@
1
- keras==2.7.0
2
- numpy==1.19.5
3
- librosa==0.8.1
4
- mir-eval==0.6
5
- pretty-midi==0.2.9
6
- pydub==0.25.1
7
- scipy==1.7.3
8
- tensorflow==2.5.0
 
 
 
 
 
 
 
 
 
icassp2022_vocal_transcription/src/MIDI.py DELETED
@@ -1,141 +0,0 @@
1
- #%%
2
- import pretty_midi
3
- import numpy as np
4
- import librosa.display
5
-
6
-
7
- #%%
8
- def plot_piano_roll(pm, start_pitch, end_pitch, fs=100):
9
- """ Plot piano roll from .mid file
10
- ----------
11
- Parameters:
12
- pm: RWC, MDB, iKala, DSD100
13
- start/end_pitch: lowest/highest note (float)
14
- fs: sampling freq. (int)
15
-
16
- """
17
- # Use librosa's specshow function for displaying the piano roll
18
- librosa.display.specshow(
19
- pm.get_piano_roll(fs)[start_pitch:end_pitch],
20
- hop_length=1,
21
- sr=fs,
22
- x_axis="time",
23
- y_axis="cqt_note",
24
- fmin=pretty_midi.note_number_to_hz(start_pitch),
25
- )
26
-
27
-
28
- def midi_to_note(file_name, pitch_shift, fs=100, start_note=40, end_note=95):
29
- """ Convert .mid to note
30
- ----------
31
- Parameters:
32
- file_name: '.mid' (str)
33
- pitch_shift: shift the pitch to adjust notes correctly (int)
34
- fs: sampling freq. (int)
35
- start/end_pitch: lowest/highest note(int)
36
-
37
- ----------
38
- Returns:
39
- notes: note/10ms (array)
40
- """
41
-
42
- pm = pretty_midi.PrettyMIDI(file_name)
43
- frame_note = pm.get_piano_roll(fs)[start_note:end_note]
44
-
45
- length_audio = frame_note.shape[1]
46
- notes = np.zeros(length_audio)
47
-
48
- for i in range(length_audio):
49
- note_tmp = np.argmax(frame_note[:, i])
50
- if note_tmp > 0:
51
- notes[i] = (note_tmp + start_note) + pitch_shift
52
- # note[i] = 2 ** ((note_tmp -69) / 12.) * 440
53
- return notes
54
-
55
-
56
- def midi_to_segment(filename):
57
- """ Convert .mid to segment
58
- ----------
59
- Parameters:
60
- filename: .mid (str)
61
-
62
- ----------
63
- Returns:
64
- segments: [start(s),end(s),pitch] (list)
65
- """
66
-
67
- pm = pretty_midi.PrettyMIDI(filename)
68
- segment = []
69
- for note in pm.instruments[0].notes:
70
- segment.append([note.start, note.end, note.pitch])
71
- return segment
72
-
73
-
74
- def segment_to_midi(segments, path_output, tempo=120):
75
- """ Convert segment to .mid
76
- ----------
77
- Parameters:
78
- segments: [start(s),end(s),pitch] (list)
79
- path_output: path of save file (str)
80
- """
81
- pm = pretty_midi.PrettyMIDI(initial_tempo=int(tempo))
82
- inst_program = pretty_midi.instrument_name_to_program("Acoustic Grand Piano")
83
- inst = pretty_midi.Instrument(program=inst_program)
84
- for segment in segments:
85
- note = pretty_midi.Note(
86
- velocity=100, start=segment[0], end=segment[1], pitch=np.int(segment[2])
87
- )
88
- inst.notes.append(note)
89
- pm.instruments.append(inst)
90
- pm.write(f"{path_output}")
91
-
92
-
93
- def note_to_segment(note):
94
- """ Convert note to segment
95
- ----------
96
- Parameters:
97
- note: note/10ms (array)
98
- ----------
99
- Returns:
100
- segments: [start(s),end(s),pitch] (list)
101
- """
102
- startSeg = []
103
- endSeg = []
104
- notes = []
105
- flag = -1
106
-
107
- if note[0] > 0:
108
- startSeg.append(0)
109
- notes.append(np.int(note[0]))
110
- flag *= -1
111
- for i in range(0, len(note) - 1):
112
- if note[i] != note[i + 1]:
113
- if flag < 0:
114
- startSeg.append(0.01 * (i + 1))
115
- notes.append(np.int(note[i + 1]))
116
- flag *= -1
117
- else:
118
- if note[i + 1] == 0:
119
- endSeg.append(0.01 * i)
120
- flag *= -1
121
- else:
122
- endSeg.append(0.01 * i)
123
- startSeg.append(0.01 * (i + 1))
124
- notes.append(np.int(note[i + 1]))
125
-
126
- return list(zip(startSeg, endSeg, notes))
127
-
128
-
129
- def note2Midi(frame_level_pitchscroe, path_output, tempo):
130
- # note = np.loadtxt(path_input_note)
131
- # note = note[:, 1]
132
- segment = note_to_segment(frame_level_pitchscroe)
133
- segment_to_midi(segment, path_output=path_output, tempo=tempo)
134
-
135
-
136
- # def note2Midi(path_input_note, path_output, tempo):
137
- # note = np.loadtxt(path_input_note)
138
- # note = note[:, 1]
139
- # segment = note_to_segment(note)
140
- # segment_to_midi(segment, path_output=path_output, tempo=tempo)
141
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
icassp2022_vocal_transcription/src/__init__.py DELETED
File without changes
icassp2022_vocal_transcription/src/__pycache__/MIDI.cpython-38.pyc DELETED
Binary file (3.48 kB)
 
icassp2022_vocal_transcription/src/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (165 Bytes)
 
icassp2022_vocal_transcription/src/__pycache__/featureExtraction.cpython-38.pyc DELETED
Binary file (1.74 kB)
 
icassp2022_vocal_transcription/src/__pycache__/model.cpython-38.pyc DELETED
Binary file (3.1 kB)
 
icassp2022_vocal_transcription/src/__pycache__/quantization.cpython-38.pyc DELETED
Binary file (4.92 kB)
 
icassp2022_vocal_transcription/src/__pycache__/singing_transcription.cpython-38.pyc DELETED
Binary file (3.99 kB)
 
icassp2022_vocal_transcription/src/__pycache__/utils.cpython-38.pyc DELETED
Binary file (1.5 kB)
 
icassp2022_vocal_transcription/src/featureExtraction.py DELETED
@@ -1,61 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- import librosa
3
- from pydub import AudioSegment
4
- import pathlib
5
-
6
- # from pydub.playback import play
7
- import numpy as np
8
- import os
9
-
10
- PATH_PROJECT = os.path.dirname(os.path.realpath(__file__))
11
-
12
-
13
- def read_audio(filepath, sr=None):
14
- path = pathlib.Path(filepath)
15
- extenstion = path.suffix.replace(".", "")
16
- if extenstion == "mp3":
17
- sound = AudioSegment.from_mp3(filepath)
18
- else:
19
- sound = AudioSegment.from_file(filepath)
20
- # sound = sound[start * 1000 : end * 1000]
21
- sound = sound.set_channels(1)
22
- if sr == None:
23
- sr = sound.frame_rate
24
- sound = sound.set_frame_rate(sr)
25
- samples = sound.get_array_of_samples()
26
- y = np.array(samples).T.astype(np.float32)
27
-
28
- return y, sr
29
-
30
-
31
- def spec_extraction(file_name, win_size):
32
-
33
- y, _ = read_audio(file_name, sr=8000)
34
-
35
- S = librosa.core.stft(y, n_fft=1024, hop_length=80, win_length=1024)
36
- x_spec = np.abs(S)
37
- x_spec = librosa.core.power_to_db(x_spec, ref=np.max)
38
- x_spec = x_spec.astype(np.float32)
39
- num_frames = x_spec.shape[1]
40
-
41
- # for padding
42
- padNum = num_frames % win_size
43
- if padNum != 0:
44
- len_pad = win_size - padNum
45
- padding_feature = np.zeros(shape=(513, len_pad))
46
- x_spec = np.concatenate((x_spec, padding_feature), axis=1)
47
- num_frames = num_frames + len_pad
48
-
49
- x_test = []
50
- for j in range(0, num_frames, win_size):
51
- x_test_tmp = x_spec[:, range(j, j + win_size)].T
52
- x_test.append(x_test_tmp)
53
- x_test = np.array(x_test)
54
-
55
- # for standardization
56
- path_project = pathlib.Path(__file__).parent.parent
57
- x_train_mean = np.load(f"{path_project}/data/x_train_mean.npy")
58
- x_train_std = np.load(f"{path_project}/data/x_train_std.npy")
59
- x_test = (x_test - x_train_mean) / (x_train_std + 0.0001)
60
- x_test = x_test[:, :, :, np.newaxis]
61
- return x_test, x_spec
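spec_extraction above works at 8 kHz with n_fft=1024 and an 80-sample hop, so every frame carries 513 bins and frames arrive at 100 per second (a 10 ms hop, matching the transcription grid); the frame axis is then zero-padded up to a multiple of the 31-frame window. The shape arithmetic, as a quick check with an assumed frame count:

```
sr, n_fft, hop, win_size = 8000, 1024, 80, 31
n_bins = n_fft // 2 + 1                                # 513 frequency bins per frame
frames_per_second = sr / hop                           # 100.0, i.e. 10 ms per frame

num_frames = 320                                       # say a clip yields 320 STFT frames
pad = (win_size - num_frames % win_size) % win_size    # 320 % 31 = 10 -> 21 padding frames
chunks = (num_frames + pad) // win_size                # 11 windows of shape (31, 513)
print(n_bins, frames_per_second, pad, chunks)          # 513 100.0 21 11
```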
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
icassp2022_vocal_transcription/src/model.py DELETED
@@ -1,139 +0,0 @@
1
- # import keras.backend as KK
2
- import math
3
- from tensorflow.keras import backend as K
4
- from tensorflow.keras.regularizers import l2
5
- from tensorflow.keras.models import Model
6
- from tensorflow.keras.layers import (
7
- Conv2D,
8
- MaxPooling2D,
9
- BatchNormalization,
10
- LeakyReLU,
11
- Dropout,
12
- LSTM,
13
- Reshape,
14
- Bidirectional,
15
- TimeDistributed,
16
- Input,
17
- add,
18
- concatenate,
19
- Lambda,
20
- Dense,
21
- Activation,
22
- )
23
-
24
-
25
- # --------------------------------------------------------------------------------
26
- def ResNet_Block(input, block_id, filterNum):
27
- x = Conv2D(
28
- filterNum,
29
- (1, 1),
30
- name="conv_s" + str(block_id) + "_1x1",
31
- padding="same",
32
- kernel_initializer="he_normal",
33
- use_bias=False,
34
- )(input)
35
- shortcut = BatchNormalization()(x)
36
- x = LeakyReLU(0.01)(shortcut)
37
-
38
- x = Conv2D(
39
- filterNum,
40
- (3, 3),
41
- name="conv" + str(block_id) + "_1",
42
- padding="same",
43
- kernel_initializer="he_normal",
44
- use_bias=False,
45
- kernel_regularizer=l2(1e-5),
46
- )(x)
47
- x = BatchNormalization()(x)
48
- x = LeakyReLU(0.01)(x)
49
-
50
- # x = Dropout(0.3)(x)
51
-
52
- x = Conv2D(
53
- filterNum,
54
- (3, 3),
55
- name="conv" + str(block_id) + "_2",
56
- padding="same",
57
- kernel_initializer="he_normal",
58
- use_bias=False,
59
- kernel_regularizer=l2(1e-5),
60
- )(x)
61
- x = BatchNormalization()(x)
62
- x = LeakyReLU(0.01)(x)
63
-
64
- x = Conv2D(
65
- filterNum,
66
- (1, 1),
67
- name="conv_f" + str(block_id) + "_1x1",
68
- padding="same",
69
- kernel_initializer="he_normal",
70
- use_bias=False,
71
- )(x)
72
- x = BatchNormalization()(x)
73
-
74
- x = add([x, shortcut])
75
- x = LeakyReLU(0.01)(x)
76
- x = MaxPooling2D((1, 4))(x)
77
- return x
78
-
79
-
80
- def melody_ResNet_JDC(num_spec, window_size, note_res):
81
-
82
- num_output = int(55 * 2 ** (math.log(note_res, 2)) + 2)
83
- input = Input(shape=(window_size, num_spec, 1))
84
- block_1 = ResNet_Block(input=input, block_id=1, filterNum=64)
85
- block_2 = ResNet_Block(input=block_1, block_id=2, filterNum=128)
86
- block_3 = ResNet_Block(input=block_2, block_id=3, filterNum=192)
87
- block_4 = ResNet_Block(input=block_3, block_id=4, filterNum=256)
88
- block_4_dp = Dropout(0.3)(block_4)
89
-
90
- keras_shape = K.int_shape(block_4)
91
- numOutput_P = keras_shape[2] * keras_shape[3]
92
- output_tmp = Reshape((window_size, numOutput_P))(block_4_dp)
93
-
94
- # voicing
95
- block_1 = MaxPooling2D((1, 4 ** 3))(block_1)
96
- block_2 = MaxPooling2D((1, 4 ** 2))(block_2)
97
- block_3 = MaxPooling2D((1, 4 ** 1))(block_3)
98
- joint = concatenate([block_1, block_2, block_3, block_4])
99
- joint = Dropout(0.3)(joint)
100
- joint = Conv2D(
101
- 256,
102
- (1, 1),
103
- padding="same",
104
- kernel_initializer="he_normal",
105
- use_bias=False,
106
- kernel_regularizer=l2(1e-5),
107
- )(joint)
108
- joint = BatchNormalization()(joint)
109
- joint = LeakyReLU(0.01)(joint)
110
-
111
- keras_shape2 = K.int_shape(joint)
112
- num_V = keras_shape2[2] * keras_shape2[3]
113
-
114
- output_V_tmp = Reshape((window_size, num_V))(joint)
115
- output_V_tmp = Bidirectional(LSTM(32, return_sequences=True, stateful=False, dropout=0.2))(
116
- output_V_tmp
117
- )
118
- output_V = TimeDistributed(Dense(2))(output_V_tmp)
119
- output_V = TimeDistributed(Activation("softmax"), name="output_AUX_V")(output_V)
120
-
121
- # output
122
- output_tmp = Bidirectional(LSTM(256, return_sequences=True, dropout=0.2))(output_tmp)
123
- output_tmp = concatenate([output_tmp, output_V_tmp])
124
- output = TimeDistributed(Dense(num_output))(output_tmp)
125
- output = TimeDistributed(Activation("softmax"), name="output")(output)
126
-
127
- output_NS = Lambda(lambda x: x[:, :, 0])(output)
128
- output_NS = Reshape((window_size, 1))(output_NS)
129
-
130
- output_S = Lambda(lambda x: 1 - x[:, :, 0])(output)
131
- output_S = Reshape((window_size, 1))(output_S)
132
- output_PV = concatenate([output_NS, output_S])
133
-
134
- output_V_F = concatenate([output_V, output_PV])
135
- output_V_F = TimeDistributed(Dense(2))(output_V_F)
136
- output_V_F = TimeDistributed(Activation("softmax"), name="output_V")(output_V_F)
137
- model = Model(inputs=input, outputs=[output, output_V_F])
138
-
139
- return model
 
icassp2022_vocal_transcription/src/quantization.py DELETED
@@ -1,217 +0,0 @@
1
- # %%
2
- import numpy as np
3
- import librosa
4
- import librosa.display
5
-
6
- from scipy.signal import medfilt
7
- from matplotlib import pyplot as plt
8
- from .featureExtraction import read_audio
9
- from .utils import *
10
-
11
-
12
- # %%
13
- def calc_tempo(path_audio):
14
- """ Calculate audio tempo
15
- ----------
16
- Parameters:
17
- path_audio: str
18
-
19
- ----------
20
- Returns:
21
- tempo: float
22
-
23
- """
24
- target_sr = 22050
25
- y, _ = read_audio(path_audio, sr=target_sr)
26
- onset_strength = librosa.onset.onset_strength(y, sr=target_sr)
27
- tempo = librosa.beat.tempo(onset_envelope=onset_strength, sr=target_sr)
28
- return tempo
29
-
30
-
31
- def one_beat_frame_size(tempo):
32
- """ Calculate frame size of 1 beat
33
- ----------
34
- Parameters:
35
- tempo: float
36
-
37
- ----------
38
- Returns:
39
- tempo: int
40
-
41
- """
42
- return np.int(np.round(60 / tempo * 100))
43
-
44
-
45
- def median_filter_pitch(pitch, medfilt_size, weight):
46
- """ Smoothing pitch using median filter
47
- ----------
48
- Parameters:
49
- pitch: array
50
- medfilt_size: int
51
- weight: float
52
-
53
- ----------
54
- Returns:
55
- pitch: array
56
-
57
- """
58
-
59
- medfilt_size = np.int(medfilt_size * weight)
60
- if medfilt_size % 2 == 0:
61
- medfilt_size += 1
62
- return np.round(medfilt(pitch, medfilt_size))
63
-
64
-
65
- def clean_note_frames(note, min_note_len=5):
66
- """ Remove short pitch frames
67
- ----------
68
- Parameters:
69
- note: array
70
- min_note_len: int
71
-
72
- ----------
73
- Returns:
74
- output: array
75
-
76
- """
77
-
78
- prev_pitch = 0
79
- prev_pitch_start = 0
80
- output = np.copy(note)
81
- for i in range(len(note)):
82
- pitch = note[i]
83
- if pitch != prev_pitch:
84
- prev_pitch_duration = i - prev_pitch_start
85
- if prev_pitch_duration < min_note_len:
86
- output[prev_pitch_start:i] = [0] * prev_pitch_duration
87
- prev_pitch = pitch
88
- prev_pitch_start = i
89
- return output
90
-
91
-
92
- def makeSegments(note):
93
- """ Make segments of notes
94
- ----------
95
- Parameters:
96
- note: array
97
-
98
- ----------
99
- Returns:
100
- startSeg: starting points (array)
101
- endSeg: ending points (array)
102
-
103
- """
104
- startSeg = []
105
- endSeg = []
106
- flag = -1
107
- if note[0] > 0:
108
- startSeg.append(0)
109
- flag *= -1
110
- for i in range(0, len(note) - 1):
111
- if note[i] != note[i + 1]:
112
- if flag < 0:
113
- startSeg.append(i + 1)
114
- flag *= -1
115
- else:
116
- if note[i + 1] == 0:
117
- endSeg.append(i)
118
- flag *= -1
119
- else:
120
- endSeg.append(i)
121
- startSeg.append(i + 1)
122
- return startSeg, endSeg
123
-
124
-
125
- def remove_short_segment(idx, note_cleaned, start, end, minLength):
126
- """ Remove short segments
127
- ----------
128
- Parameters:
129
- idx: (int)
130
- note_cleaned: (array)
131
- start: starting points (array)
132
- end: ending points (array)
133
- minLength: (int)
134
-
135
- ----------
136
- Returns:
137
- note_cleaned: (array)
138
-
139
- """
140
-
141
- len_seg = end[idx] - start[idx]
142
- if len_seg < minLength:
143
- if (start[idx + 1] - end[idx] > minLength) and (start[idx] - end[idx - 1] > minLength):
144
- note_cleaned[start[idx] : end[idx] + 1] = [0] * (len_seg + 1)
145
- return note_cleaned
146
-
147
-
148
- def remove_octave_error(idx, note_cleaned, start, end):
149
- """ Remove octave error
150
- ----------
151
- Parameters:
152
- idx: (int)
153
- note_cleaned: (array)
154
- start: starting points (array)
155
- end: ending points (array)
156
-
157
- ----------
158
- Returns:
159
- note_cleaned: (array)
160
-
161
- """
162
- len_seg = end[idx] - start[idx]
163
- if (note_cleaned[start[idx - 1]] == note_cleaned[start[idx + 1]]) and (
164
- note_cleaned[start[idx]] != note_cleaned[start[idx + 1]]
165
- ):
166
- if np.abs(note_cleaned[start[idx]] - note_cleaned[start[idx + 1]]) % 12 == 0:
167
- note_cleaned[start[idx] - 1 : end[idx] + 1] = [note_cleaned[start[idx + 1]]] * (
168
- len_seg + 2
169
- )
170
- return note_cleaned
171
-
172
-
173
- def clean_segment(note, minLength):
174
- """ clean note segments
175
- ----------
176
- Parameters:
177
- note: (array)
178
- minLength: (int)
179
-
180
- ----------
181
- Returns:
182
- note_cleaned: (array)
183
-
184
- """
185
-
186
- note_cleaned = np.copy(note)
187
- start, end = makeSegments(note_cleaned)
188
-
189
- for i in range(1, len(start) - 1):
190
- note_cleaned = remove_short_segment(i, note_cleaned, start, end, minLength)
191
- note_cleaned = remove_octave_error(i, note_cleaned, start, end)
192
- return note_cleaned
193
-
194
-
195
- def refine_note(est_note, tempo):
196
- """ main: refine note segments
197
- ----------
198
- Parameters:
199
- est_note: (array)
200
- tempo: (float)
201
-
202
- ----------
203
- Returns:
204
- est_pitch_mf3_v: (array)
205
-
206
- """
207
- one_beat_size = one_beat_frame_size(tempo)
208
- est_note_mf1 = median_filter_pitch(est_note, one_beat_size, 1 / 8)
209
- est_note_mf2 = median_filter_pitch(est_note_mf1, one_beat_size, 1 / 4)
210
- est_note_mf3 = median_filter_pitch(est_note_mf2, one_beat_size, 1 / 3)
211
-
212
- vocing = est_note_mf1 > 0
213
- est_pitch_mf3_v = vocing * est_note_mf3
214
- est_pitch_mf3_v = clean_note_frames(est_pitch_mf3_v, int(one_beat_size * 1 / 8))
215
- est_pitch_mf3_v = clean_segment(est_pitch_mf3_v, int(one_beat_size * 1 / 4))
216
- return est_pitch_mf3_v
217
-
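All of the smoothing in the deleted quantization.py is expressed in beats: one_beat_frame_size(tempo) is the number of 10 ms frames per beat, and refine_note median-filters with kernels of 1/8, 1/4 and 1/3 of a beat (median_filter_pitch bumps even kernels up to the next odd size, as scipy's medfilt requires). A worked example of those kernel sizes for an assumed tempo:

```
import numpy as np

def one_beat_frame_size(tempo):
    return int(np.round(60 / tempo * 100))   # 60/tempo seconds per beat, 100 frames per second

tempo = 120.0                                # hypothetical librosa.beat.tempo estimate
beat = one_beat_frame_size(tempo)            # 50 frames = 0.5 s
for weight in (1 / 8, 1 / 4, 1 / 3):
    k = int(beat * weight)
    if k % 2 == 0:
        k += 1                               # medfilt needs an odd kernel size
    print(weight, k)                         # 0.125 -> 7, 0.25 -> 13, 0.333... -> 17
```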
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
icassp2022_vocal_transcription/src/singing_transcription.py DELETED
@@ -1,147 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # %%
3
- import argparse
4
- import numpy as np
5
- from pathlib import Path
6
- from .model import *
7
- from .featureExtraction import *
8
- from .quantization import *
9
- from .utils import *
10
- from .MIDI import *
11
-
12
- # %%
13
- class SingingTranscription:
14
- def __init__(self):
15
-
16
- self.PATH_PROJECT = pathlib.Path(__file__).absolute().parent.parent
17
- self.num_spec = 513
18
- self.window_size = 31
19
- self.note_res = 1
20
- self.batch_size = 64
21
-
22
- def load_model(self, path_weight, TF_summary=False):
23
-
24
- model = melody_ResNet_JDC(self.num_spec, self.window_size, self.note_res)
25
- model.load_weights(path_weight)
26
- if TF_summary == True:
27
- print(model.summary())
28
- return model
29
-
30
- def predict_melody(self, model_ST, filepath):
31
- pitch_range = np.arange(40, 95 + 1.0 / self.note_res, 1.0 / self.note_res)
32
- pitch_range = np.concatenate([np.zeros(1), pitch_range])
33
-
34
- """ Features extraction"""
35
- X_test, _ = spec_extraction(file_name=filepath, win_size=self.window_size)
36
-
37
- """ melody predict"""
38
- y_predict = model_ST.predict(X_test, batch_size=self.batch_size, verbose=1)
39
- y_predict = y_predict[0] # [0]: note, [1]: voicing
40
- y_shape = y_predict.shape
41
- num_total = y_shape[0] * y_shape[1]
42
- y_predict = np.reshape(y_predict, (num_total, y_shape[2]))
43
-
44
- est_MIDI = np.zeros(num_total)
45
- est_freq = np.zeros(num_total)
46
- for i in range(num_total):
47
- index_predict = np.argmax(y_predict[i])
48
- pitch_MIDI = pitch_range[np.int32(index_predict)]
49
- if pitch_MIDI >= 40 and pitch_MIDI <= 95:
50
- est_MIDI[i] = pitch_MIDI
51
- # est_freq[i] = 2 ** ((pitch_MIDI - 69) / 12.0) * 440
52
- return est_MIDI
53
-
54
- def save_output_frame_level(self, pitch_score, path_save, note_or_freq="note"):
55
- check_and_make_dir(Path(path_save))
56
- f = open(path_save, "w")
57
-
58
- assert (note_or_freq == "freq") or (note_or_freq == "note"), "please check 'note' or 'freq'"
59
- if note_or_freq == "freq":
60
- for j in range(len(pitch_score)):
61
- if pitch_score[j] > 0:
62
- pitch_score[j] = 2 ** ((pitch_score[j] - 69) / 12.0) * 440
63
- est = "%.2f %.4f\n" % (0.01 * j, pitch_score[j])
64
- f.write(est)
65
- elif note_or_freq == "note":
66
- for j in range(len(pitch_score)):
67
- est = "%.2f %.4f\n" % (0.01 * j, pitch_score[j])
68
- f.write(est)
69
-
70
- f.close()
71
-
72
-
73
- def main(args):
74
- ST = SingingTranscription()
75
-
76
- """ load model """
77
- model_ST = ST.load_model(f"{ST.PATH_PROJECT}/data/weight_ST.hdf5", TF_summary=False)
78
-
79
- """ predict note (time-freq) """
80
- path_audio = args.path_audio
81
- fl_note = ST.predict_melody(model_ST, path_audio) # frame-level pitch score
82
-
83
- """ post-processing """
84
- tempo = calc_tempo(path_audio)
85
- refined_fl_note = refine_note(fl_note, tempo) # frame-level pitch score
86
-
87
- """ convert frame-level pitch score to note-level (time-axis) """
88
- segment = note_to_segment(refined_fl_note) # note-level pitch score
89
-
90
- """ save ouput to .mid """
91
- filename = get_filename_wo_extension(path_audio)
92
- path_output = f"{args.path_save}/{filename}.mid"
93
- segment_to_midi(segment, path_output=path_output, tempo=tempo)
94
-
95
- if args.output_type == "fps":
96
- path_note = f"{args.path_save}/{filename}.txt"
97
- ST.save_output_frame_level(refined_fl_note, path_note, note_or_freq="freq")
98
-
99
- print(f"\n========= DONE =========")
100
- print(f"input: '{path_audio}'")
101
- print(f"output: '{path_output}'")
102
-
103
- ST = SingingTranscription()
104
-
105
- """ load model """
106
- model_ST = ST.load_model(f"{ST.PATH_PROJECT}/data/weight_ST.hdf5", TF_summary=False)
107
-
108
-
109
- def get_frame_level_output(wav_path):
110
- """ predict note (time-freq) """
111
- path_audio = wav_path
112
- fl_note = ST.predict_melody(model_ST, path_audio) # frame-level pitch score
113
-
114
- """ post-processing """
115
- tempo = calc_tempo(path_audio)
116
- refined_fl_note = refine_note(fl_note, tempo) # frame-level pitch score
117
- return refined_fl_note.astype(int)
118
-
119
-
120
- # %%
121
- if __name__ == "__main__":
122
- PATH_PROJECT = pathlib.Path(__file__).absolute().parent.parent
123
- parser = argparse.ArgumentParser(description="Predict singing transcription")
124
- parser.add_argument(
125
- "-i",
126
- "--path_audio",
127
- type=str,
128
- help="Path to input audio file.",
129
- default=f"{PATH_PROJECT}/audio/pop1.wav",
130
- )
131
- parser.add_argument(
132
- "-o",
133
- "--path_save",
134
- type=str,
135
- help="Path to folder for saving .mid file",
136
- default=f"{PATH_PROJECT}/output",
137
- )
138
-
139
- parser.add_argument(
140
- "-ot",
141
- "--output_type",
142
- type=str,
143
- help="(optional) Output type: midi or frame-level pitch score(fps)",
144
- default="midi",
145
- )
146
-
147
- main(parser.parse_args())
 
icassp2022_vocal_transcription/src/utils.py DELETED
@@ -1,49 +0,0 @@
1
- import os
2
- import numpy as np
3
- from pydub import AudioSegment
4
- import pathlib
5
-
6
-
7
- def check_and_make_dir(path_dir):
8
- if not os.path.exists(os.path.dirname(path_dir)):
9
- os.makedirs(os.path.dirname(path_dir))
10
-
11
-
12
- def get_filename_wo_extension(path_dir):
13
- return pathlib.Path(path_dir).stem
14
-
15
-
16
- def note2pitch(pitch):
17
- """ Convert MIDI number to freq.
18
- ----------
19
- Parameters:
20
- pitch: MIDI note numbers of pitch (array)
21
-
22
- ----------
23
- Returns:
24
- pitch: frequency of pitch (array)
25
- """
26
-
27
- pitch = np.array(pitch)
28
- pitch[pitch > 0] = 2 ** ((pitch[pitch > 0] - 69) / 12.0) * 440
29
- return pitch
30
-
31
-
32
- def pitch2note(pitch):
33
- """ Convert freq to MIDI number
34
- ----------
35
- Parameters:
36
- pitch: frequency of pitch (array)
37
-
38
- ----------
39
- Returns:
40
- pitch: MIDI note numbers of pitch (array)
41
- """
42
- pitch = np.array(pitch)
43
- pitch[pitch > 0] = np.round((69.0 + 12.0 * np.log2(pitch[pitch > 0] / 440.0)))
44
- return pitch
45
-
46
-
47
- a = np.array([0, 0, 0, 1, 2, 3, 5, 0, 0, 0, 1, 2, 4, 5])
48
- b = a[a > 0] * 2
49
- print(b)
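note2pitch and pitch2note above are the standard MIDI/Hz pair, f = 440 * 2^((m - 69)/12) and m = 69 + 12 * log2(f / 440), applied only to voiced (> 0) entries so rests stay at zero. Two spot checks:

```
import numpy as np

midi = np.array([0.0, 57, 69, 81])           # 0 marks an unvoiced frame
freq = midi.copy()
freq[freq > 0] = 2 ** ((freq[freq > 0] - 69) / 12.0) * 440
print(freq)                                  # [  0. 220. 440. 880.]

back = freq.copy()
back[back > 0] = np.round(69.0 + 12.0 * np.log2(back[back > 0] / 440.0))
print(back)                                  # [ 0. 57. 69. 81.]
```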
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer_tool.py CHANGED
@@ -1,100 +1,175 @@
 
1
  import os
 
2
 
 
3
  import numpy as np
4
- import soundfile
5
  import torch
6
  import torchaudio
7
- from pydub import AudioSegment
8
 
9
- dev = torch.device("cpu")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
- def get_units(path, hubert_soft):
13
- source, sr = torchaudio.load(path)
 
 
 
 
 
 
 
 
 
14
  source = torchaudio.functional.resample(source, sr, 16000)
 
 
15
  source = source.unsqueeze(0).to(dev)
16
  with torch.inference_mode():
17
  units = hubert_soft.units(source)
18
  return units
19
 
20
 
21
- def transcribe(path, length, transform, feature_input):
22
- feature_pit = feature_input.compute_f0(path)
23
  feature_pit = feature_pit * 2 ** (transform / 12)
24
  feature_pit = resize2d_f0(feature_pit, length)
25
  coarse_pit = feature_input.coarse_f0(feature_pit)
26
  return coarse_pit
27
 
28
 
29
- def resize2d_plus(source, target_len):
30
- source = source.astype(float)
31
- source[source < 0.001] = np.nan
32
- target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
33
- source)
34
- res = np.nan_to_num(target)
35
- ret = res[:].astype(int)
36
- # If resizing samples a point between two source frames, use the previous point as the current pitch value
37
- for i in range(len(res)):
38
- if res[i] - ret[i] > 0.001:
39
- ret[i] = ret[i - 1]
40
- return ret
 
 
 
 
 
 
41
 
 
 
 
42
 
43
- def infer(file_name, speaker_id, tran, target_sample, net_g_ms, hubert_soft, feature_input):
44
- source_path = "./wav_temp/input/" + file_name
45
- audio, sample_rate = torchaudio.load(source_path)
46
- input_size = audio.shape[-1]
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  sid = torch.LongTensor([int(speaker_id)]).to(dev)
49
- soft = get_units(source_path, hubert_soft).squeeze(0).cpu().numpy()
50
- pitch = transcribe(source_path, soft.shape[0], tran, feature_input)
51
- pitch = torch.LongTensor(pitch).unsqueeze(0).to(dev)
52
  stn_tst = torch.FloatTensor(soft)
53
  with torch.no_grad():
54
  x_tst = stn_tst.unsqueeze(0).to(dev)
55
  x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
56
  audio = \
57
- net_g_ms.infer(x_tst, x_tst_lengths, pitch, sid=sid, noise_scale=.3, noise_scale_w=0.5,
58
  length_scale=1)[0][
59
  0, 0].data.float().cpu().numpy()
60
- soundfile.write("./wav_temp/output/" + file_name, audio,
61
- int(audio.shape[0] / input_size * target_sample))
62
-
63
-
64
- def resize2d_f0(x, target_len):
65
- source = np.array(x)
66
- source[source < 0.001] = np.nan
67
- target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
68
- source)
69
- res = np.nan_to_num(target)
70
- return res
71
-
72
 
73
- # Deleting a file in Python: os.remove(path), where path is the absolute path of the file, e.g.:
74
- def del_file(path_data):
75
- for i in os.listdir(path_data): # os.listdir(path_data) returns a list of the relative paths of everything in this directory
76
- os.remove(path_data + i)
77
 
 
 
 
78
 
79
- def cut(c_time, file_path, vocal_name, out_dir):
80
- audio_segment = AudioSegment.from_file(file_path, format='wav')
81
 
82
- total = int(audio_segment.duration_seconds / c_time) # number of full slices after cutting the audio
83
- for i in range(total):
84
- # slice the audio every c_time seconds and name the slices sequentially
85
- audio_segment[i * c_time * 1000:(i + 1) * c_time * 1000].export(f"{out_dir}/{vocal_name}-{i}.wav",
86
- format="wav")
87
- audio_segment[total * c_time * 1000:].export(f"{out_dir}/{vocal_name}-{total}.wav", format="wav") # remaining tail segment shorter than c_time
88
-
89
-
90
- def wav_resample(audio_path, tar_sample):
91
  raw_audio, raw_sample_rate = torchaudio.load(audio_path)
92
- audio_22050 = torchaudio.transforms.Resample(orig_freq=raw_sample_rate, new_freq=tar_sample)(raw_audio)[0]
93
- soundfile.write(audio_path, audio_22050, tar_sample)
94
- return audio_path
 
 
95
 
96
 
97
  def fill_a_to_b(a, b):
98
  if len(a) < len(b):
99
  for _ in range(0, len(b) - len(a)):
100
  a.append(a[0])
 
 
 
 
 
 
 
1
+ import logging
2
  import os
3
+ import time
4
 
5
+ import matplotlib.pyplot as plt
6
  import numpy as np
 
7
  import torch
8
  import torchaudio
 
9
 
10
+ import hubert_model
11
+ import utils
12
+ from models import SynthesizerTrn
13
+ from preprocess_wave import FeatureInput
14
+
15
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
16
+ dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17
+
18
+
19
+ def timeit(func):
20
+ def run(*args, **kwargs):
21
+ t = time.time()
22
+ res = func(*args, **kwargs)
23
+ print('executing \'%s\' took %.3fs' % (func.__name__, time.time() - t))
24
+ return res
25
+
26
+ return run
27
+
28
+
29
+ def get_end_file(dir_path, end):
30
+ file_lists = []
31
+ for root, dirs, files in os.walk(dir_path):
32
+ files = [f for f in files if f[0] != '.']
33
+ dirs[:] = [d for d in dirs if d[0] != '.']
34
+ for f_file in files:
35
+ if f_file.endswith(end):
36
+ file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
37
+ return file_lists
38
+
39
+
40
+ def load_model(model_path, config_path):
41
+ # load the model configuration
42
+ hps_ms = utils.get_hparams_from_file(config_path)
43
+ n_g_ms = SynthesizerTrn(
44
+ 178,
45
+ hps_ms.data.filter_length // 2 + 1,
46
+ hps_ms.train.segment_size // hps_ms.data.hop_length,
47
+ n_speakers=hps_ms.data.n_speakers,
48
+ **hps_ms.model)
49
+ _ = utils.load_checkpoint(model_path, n_g_ms, None)
50
+ _ = n_g_ms.eval().to(dev)
51
+ # load the HuBERT soft content encoder
52
+ hubert_soft = hubert_model.hubert_soft(get_end_file("./", "pt")[0])
53
+ feature_input = FeatureInput(hps_ms.data.sampling_rate, hps_ms.data.hop_length)
54
+ return n_g_ms, hubert_soft, feature_input, hps_ms
55
 
56
 
57
+ def resize2d_f0(x, target_len):
58
+ source = np.array(x)
59
+ source[source < 0.001] = np.nan
60
+ target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
61
+ source)
62
+ res = np.nan_to_num(target)
63
+ return res
64
+
65
+
66
+ def get_units(in_path, hubert_soft):
67
+ source, sr = torchaudio.load(in_path)
68
  source = torchaudio.functional.resample(source, sr, 16000)
69
+ if len(source.shape) == 2 and source.shape[1] >= 2:
70
+ source = torch.mean(source, dim=0).unsqueeze(0)
71
  source = source.unsqueeze(0).to(dev)
72
  with torch.inference_mode():
73
  units = hubert_soft.units(source)
74
  return units
75
 
76
 
77
+ def transcribe(source_path, length, transform, feature_input):
78
+ feature_pit = feature_input.compute_f0(source_path)
79
  feature_pit = feature_pit * 2 ** (transform / 12)
80
  feature_pit = resize2d_f0(feature_pit, length)
81
  coarse_pit = feature_input.coarse_f0(feature_pit)
82
  return coarse_pit
83
 
84
 
85
+ def get_unit_pitch(in_path, tran, hubert_soft, feature_input):
86
+ soft = get_units(in_path, hubert_soft).squeeze(0).cpu().numpy()
87
+ input_pitch = transcribe(in_path, soft.shape[0], tran, feature_input)
88
+ return soft, input_pitch
89
+
90
+
91
+ def clean_pitch(input_pitch):
92
+ num_nan = np.sum(input_pitch == 1)
93
+ if num_nan / len(input_pitch) > 0.9:
94
+ input_pitch[input_pitch != 1] = 1
95
+ return input_pitch
96
+
97
+
98
+ def plt_pitch(input_pitch):
99
+ input_pitch = input_pitch.astype(float)
100
+ input_pitch[input_pitch == 1] = np.nan
101
+ return input_pitch
102
+
103
 
104
+ def f0_to_pitch(ff):
105
+ f0_pitch = 69 + 12 * np.log2(ff / 440)
106
+ return f0_pitch
107
 
 
 
 
 
108
 
109
+ def f0_plt(in_path, out_path, tran, hubert_soft, feature_input):
110
+ s1, input_pitch = get_unit_pitch(in_path, tran, hubert_soft, feature_input)
111
+ s2, output_pitch = get_unit_pitch(out_path, 0, hubert_soft, feature_input)
112
+ plt.clf()
113
+ plt.plot(plt_pitch(input_pitch), color="#66ccff")
114
+ plt.plot(plt_pitch(output_pitch), color="orange")
115
+ plt.savefig("temp.jpg")
116
+
117
+
118
+ def calc_error(in_path, out_path, tran, feature_input):
119
+ input_pitch = feature_input.compute_f0(in_path)
120
+ output_pitch = feature_input.compute_f0(out_path)
121
+ sum_y = []
122
+ if np.sum(input_pitch == 0) / len(input_pitch) > 0.9:
123
+ mistake, var_take = 0, 0
124
+ else:
125
+ for i in range(min(len(input_pitch), len(output_pitch))):
126
+ if input_pitch[i] > 0 and output_pitch[i] > 0:
127
+ sum_y.append(abs(f0_to_pitch(output_pitch[i]) - (f0_to_pitch(input_pitch[i]) + tran)))
128
+ num_y = 0
129
+ for x in sum_y:
130
+ num_y += x
131
+ len_y = len(sum_y) if len(sum_y) else 1
132
+ mistake = round(float(num_y / len_y), 2)
133
+ var_take = round(float(np.std(sum_y, ddof=1)), 2)
134
+ return mistake, var_take
135
+
136
+
137
+ def infer(source_path, speaker_id, tran, net_g_ms, hubert_soft, feature_input):
138
  sid = torch.LongTensor([int(speaker_id)]).to(dev)
139
+ soft, pitch = get_unit_pitch(source_path, tran, hubert_soft, feature_input)
140
+ pitch = torch.LongTensor(clean_pitch(pitch)).unsqueeze(0).to(dev)
 
141
  stn_tst = torch.FloatTensor(soft)
142
  with torch.no_grad():
143
  x_tst = stn_tst.unsqueeze(0).to(dev)
144
  x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
145
  audio = \
146
+ net_g_ms.infer(x_tst, x_tst_lengths, pitch, sid=sid, noise_scale=0.3, noise_scale_w=0.5,
147
  length_scale=1)[0][
148
  0, 0].data.float().cpu().numpy()
149
+ return audio, audio.shape[-1]
 
 
 
 
 
 
 
 
 
 
 
150
 
 
 
 
 
151
 
152
+ def del_temp_wav(path_data):
153
+ for i in get_end_file(path_data, "wav"): # get_end_file returns the paths of all .wav files under path_data
154
+ os.remove(i)
155
 
 
 
156
 
157
+ def format_wav(audio_path, tar_sample):
 
 
 
 
 
 
 
 
158
  raw_audio, raw_sample_rate = torchaudio.load(audio_path)
159
+ if len(raw_audio.shape) == 2 and raw_audio.shape[1] >= 2:
160
+ raw_audio = torch.mean(raw_audio, dim=0).unsqueeze(0)
161
+ tar_audio = torchaudio.functional.resample(raw_audio, raw_sample_rate, tar_sample)
162
+ torchaudio.save(audio_path[:-4] + ".wav", tar_audio, tar_sample)
163
+ return tar_audio, tar_sample
164
 
165
 
166
  def fill_a_to_b(a, b):
167
  if len(a) < len(b):
168
  for _ in range(0, len(b) - len(a)):
169
  a.append(a[0])
170
+
171
+
172
+ def mkdir(paths: list):
173
+ for path in paths:
174
+ if not os.path.exists(path):
175
+ os.mkdir(path)
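After this rewrite, infer_tool exposes a small end-to-end API: load_model builds the SynthesizerTrn, the HuBERT soft encoder and the FeatureInput f0 front end from a checkpoint/config pair, and infer returns the converted audio plus its length in samples. A hedged usage sketch; the file names and speaker id are placeholders, and load_model additionally expects a hubert-soft *.pt somewhere under the working directory:

```
import soundfile

import infer_tool

net_g, hubert, feature_input, hps = infer_tool.load_model("G_latest.pth", "config.json")

in_path = "raw/source.wav"                              # placeholder input clip
infer_tool.format_wav(in_path, hps.data.sampling_rate)  # downmix to mono and resample in place
audio, frames = infer_tool.infer(in_path, speaker_id=0, tran=0,
                                 net_g_ms=net_g, hubert_soft=hubert,
                                 feature_input=feature_input)
soundfile.write("out.wav", audio, hps.data.sampling_rate)
```
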
models.py CHANGED
@@ -1,15 +1,15 @@
1
- import copy
2
  import math
 
 
3
  import torch
4
  from torch import nn
 
5
  from torch.nn import functional as F
6
- import numpy as np
 
 
7
  import commons
8
  import modules
9
- import attentions
10
-
11
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
12
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13
  from commons import init_weights, get_padding
14
 
15
 
@@ -189,7 +189,7 @@ class TextEncoder(nn.Module):
189
 
190
  # self.emb = nn.Embedding(n_vocab, hidden_channels)
191
  # nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
192
- self.emb_pitch = nn.Embedding(128, hidden_channels)
193
  nn.init.normal_(self.emb_pitch.weight, 0.0, hidden_channels ** -0.5)
194
 
195
  self.encoder = attentions.Encoder(
@@ -491,8 +491,8 @@ class SynthesizerTrn(nn.Module):
491
  self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
492
  gin_channels=gin_channels)
493
  self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
494
- self.pitch_net = PitchPredictor(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers,
495
- kernel_size, p_dropout)
496
 
497
  if use_sdp:
498
  self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
@@ -504,12 +504,6 @@ class SynthesizerTrn(nn.Module):
504
 
505
  def infer(self, x, x_lengths, pitch, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
506
  x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, pitch)
507
- pred_pitch, pitch_embedding = self.pitch_net(x, x_mask)
508
- x = x + pitch_embedding
509
- # print(pred_pitch)
510
- gt_lf0 = torch.log(440 * (2 ** ((pitch - 69) / 12)))
511
-
512
- # print(gt_lf0)
513
  if self.n_speakers > 0:
514
  g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
515
  else:
 
 
1
  import math
2
+ import math
3
+
4
  import torch
5
  from torch import nn
6
+ from torch.nn import Conv1d, ConvTranspose1d, Conv2d
7
  from torch.nn import functional as F
8
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
9
+
10
+ import attentions
11
  import commons
12
  import modules
 
 
 
 
13
  from commons import init_weights, get_padding
14
 
15
 
 
189
 
190
  # self.emb = nn.Embedding(n_vocab, hidden_channels)
191
  # nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
192
+ self.emb_pitch = nn.Embedding(256, hidden_channels)
193
  nn.init.normal_(self.emb_pitch.weight, 0.0, hidden_channels ** -0.5)
194
 
195
  self.encoder = attentions.Encoder(
 
491
  self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
492
  gin_channels=gin_channels)
493
  self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
494
+ # self.pitch_net = PitchPredictor(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers,
495
+ # kernel_size, p_dropout)
496
 
497
  if use_sdp:
498
  self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
 
504
 
505
  def infer(self, x, x_lengths, pitch, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
506
  x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, pitch)
 
 
 
 
 
 
507
  if self.n_speakers > 0:
508
  g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
509
  else:
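Two things change in models.py above: emb_pitch grows from 128 to 256 entries, and the PitchPredictor path is commented out of inference, so the encoder embeds the coarse pitch ids directly instead of also adding a predicted-pitch embedding. The 256 size lines up with FeatureInput.coarse_f0 in preprocess_wave.py below, whose output is asserted to stay in [1, 255]. A minimal lookup sketch (hidden_channels = 192 is a placeholder; the real value comes from the model config):

import torch
from torch import nn

hidden_channels = 192                                # placeholder; read from config.json in practice
emb_pitch = nn.Embedding(256, hidden_channels)
nn.init.normal_(emb_pitch.weight, 0.0, hidden_channels ** -0.5)

coarse_pitch = torch.randint(1, 256, (1, 120))       # [batch, frames], ids in [1, 255]
pitch_emb = emb_pitch(coarse_pitch)                  # [1, 120, hidden_channels]
print(pitch_emb.shape)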
modules.py CHANGED
@@ -1,187 +1,184 @@
1
- import copy
2
  import math
3
- import numpy as np
4
- import scipy
5
  import torch
6
  from torch import nn
7
- from torch.nn import functional as F
8
-
9
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
  from torch.nn.utils import weight_norm, remove_weight_norm
11
 
12
  import commons
13
  from commons import init_weights, get_padding
14
  from transforms import piecewise_rational_quadratic_transform
15
 
16
-
17
  LRELU_SLOPE = 0.1
18
 
19
 
20
  class LayerNorm(nn.Module):
21
- def __init__(self, channels, eps=1e-5):
22
- super().__init__()
23
- self.channels = channels
24
- self.eps = eps
 
 
 
25
 
26
- self.gamma = nn.Parameter(torch.ones(channels))
27
- self.beta = nn.Parameter(torch.zeros(channels))
 
 
28
 
29
- def forward(self, x):
30
- x = x.transpose(1, -1)
31
- x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32
- return x.transpose(1, -1)
33
 
34
-
35
  class ConvReluNorm(nn.Module):
36
- def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
37
- super().__init__()
38
- self.in_channels = in_channels
39
- self.hidden_channels = hidden_channels
40
- self.out_channels = out_channels
41
- self.kernel_size = kernel_size
42
- self.n_layers = n_layers
43
- self.p_dropout = p_dropout
44
- assert n_layers > 1, "Number of layers should be larger than 0."
45
-
46
- self.conv_layers = nn.ModuleList()
47
- self.norm_layers = nn.ModuleList()
48
- self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
49
- self.norm_layers.append(LayerNorm(hidden_channels))
50
- self.relu_drop = nn.Sequential(
51
- nn.ReLU(),
52
- nn.Dropout(p_dropout))
53
- for _ in range(n_layers-1):
54
- self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
55
- self.norm_layers.append(LayerNorm(hidden_channels))
56
- self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
57
- self.proj.weight.data.zero_()
58
- self.proj.bias.data.zero_()
59
-
60
- def forward(self, x, x_mask):
61
- x_org = x
62
- for i in range(self.n_layers):
63
- x = self.conv_layers[i](x * x_mask)
64
- x = self.norm_layers[i](x)
65
- x = self.relu_drop(x)
66
- x = x_org + self.proj(x)
67
- return x * x_mask
68
 
69
 
70
  class DDSConv(nn.Module):
71
- """
72
- Dialted and Depth-Separable Convolution
73
- """
74
- def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
75
- super().__init__()
76
- self.channels = channels
77
- self.kernel_size = kernel_size
78
- self.n_layers = n_layers
79
- self.p_dropout = p_dropout
80
-
81
- self.drop = nn.Dropout(p_dropout)
82
- self.convs_sep = nn.ModuleList()
83
- self.convs_1x1 = nn.ModuleList()
84
- self.norms_1 = nn.ModuleList()
85
- self.norms_2 = nn.ModuleList()
86
- for i in range(n_layers):
87
- dilation = kernel_size ** i
88
- padding = (kernel_size * dilation - dilation) // 2
89
- self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
90
- groups=channels, dilation=dilation, padding=padding
91
- ))
92
- self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
93
- self.norms_1.append(LayerNorm(channels))
94
- self.norms_2.append(LayerNorm(channels))
95
-
96
- def forward(self, x, x_mask, g=None):
97
- if g is not None:
98
- x = x + g
99
- for i in range(self.n_layers):
100
- y = self.convs_sep[i](x * x_mask)
101
- y = self.norms_1[i](y)
102
- y = F.gelu(y)
103
- y = self.convs_1x1[i](y)
104
- y = self.norms_2[i](y)
105
- y = F.gelu(y)
106
- y = self.drop(y)
107
- x = x + y
108
- return x * x_mask
 
109
 
110
 
111
  class WN(torch.nn.Module):
112
- def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
113
- super(WN, self).__init__()
114
- assert(kernel_size % 2 == 1)
115
- self.hidden_channels =hidden_channels
116
- self.kernel_size = kernel_size,
117
- self.dilation_rate = dilation_rate
118
- self.n_layers = n_layers
119
- self.gin_channels = gin_channels
120
- self.p_dropout = p_dropout
121
-
122
- self.in_layers = torch.nn.ModuleList()
123
- self.res_skip_layers = torch.nn.ModuleList()
124
- self.drop = nn.Dropout(p_dropout)
125
-
126
- if gin_channels != 0:
127
- cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
128
- self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
129
-
130
- for i in range(n_layers):
131
- dilation = dilation_rate ** i
132
- padding = int((kernel_size * dilation - dilation) / 2)
133
- in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
134
- dilation=dilation, padding=padding)
135
- in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
136
- self.in_layers.append(in_layer)
137
-
138
- # last one is not necessary
139
- if i < n_layers - 1:
140
- res_skip_channels = 2 * hidden_channels
141
- else:
142
- res_skip_channels = hidden_channels
143
-
144
- res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
145
- res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
146
- self.res_skip_layers.append(res_skip_layer)
147
-
148
- def forward(self, x, x_mask, g=None, **kwargs):
149
- output = torch.zeros_like(x)
150
- n_channels_tensor = torch.IntTensor([self.hidden_channels])
151
-
152
- if g is not None:
153
- g = self.cond_layer(g)
154
-
155
- for i in range(self.n_layers):
156
- x_in = self.in_layers[i](x)
157
- if g is not None:
158
- cond_offset = i * 2 * self.hidden_channels
159
- g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
160
- else:
161
- g_l = torch.zeros_like(x_in)
162
-
163
- acts = commons.fused_add_tanh_sigmoid_multiply(
164
- x_in,
165
- g_l,
166
- n_channels_tensor)
167
- acts = self.drop(acts)
168
-
169
- res_skip_acts = self.res_skip_layers[i](acts)
170
- if i < self.n_layers - 1:
171
- res_acts = res_skip_acts[:,:self.hidden_channels,:]
172
- x = (x + res_acts) * x_mask
173
- output = output + res_skip_acts[:,self.hidden_channels:,:]
174
- else:
175
- output = output + res_skip_acts
176
- return output * x_mask
177
-
178
- def remove_weight_norm(self):
179
- if self.gin_channels != 0:
180
- torch.nn.utils.remove_weight_norm(self.cond_layer)
181
- for l in self.in_layers:
182
- torch.nn.utils.remove_weight_norm(l)
183
- for l in self.res_skip_layers:
184
- torch.nn.utils.remove_weight_norm(l)
185
 
186
 
187
  class ResBlock1(torch.nn.Module):
@@ -209,11 +206,11 @@ class ResBlock1(torch.nn.Module):
209
 
210
  def forward(self, x, x_mask=None):
211
  for c1, c2 in zip(self.convs1, self.convs2):
212
- xt = F.leaky_relu(x, LRELU_SLOPE)
213
  if x_mask is not None:
214
  xt = xt * x_mask
215
  xt = c1(xt)
216
- xt = F.leaky_relu(xt, LRELU_SLOPE)
217
  if x_mask is not None:
218
  xt = xt * x_mask
219
  xt = c2(xt)
@@ -242,7 +239,7 @@ class ResBlock2(torch.nn.Module):
242
 
243
  def forward(self, x, x_mask=None):
244
  for c in self.convs:
245
- xt = F.leaky_relu(x, LRELU_SLOPE)
246
  if x_mask is not None:
247
  xt = xt * x_mask
248
  xt = c(xt)
@@ -257,134 +254,135 @@ class ResBlock2(torch.nn.Module):
257
 
258
 
259
  class Log(nn.Module):
260
- def forward(self, x, x_mask, reverse=False, **kwargs):
261
- if not reverse:
262
- y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
263
- logdet = torch.sum(-y, [1, 2])
264
- return y, logdet
265
- else:
266
- x = torch.exp(x) * x_mask
267
- return x
268
-
269
 
270
  class Flip(nn.Module):
271
- def forward(self, x, *args, reverse=False, **kwargs):
272
- x = torch.flip(x, [1])
273
- if not reverse:
274
- logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
275
- return x, logdet
276
- else:
277
- return x
278
 
279
 
280
  class ElementwiseAffine(nn.Module):
281
- def __init__(self, channels):
282
- super().__init__()
283
- self.channels = channels
284
- self.m = nn.Parameter(torch.zeros(channels,1))
285
- self.logs = nn.Parameter(torch.zeros(channels,1))
286
-
287
- def forward(self, x, x_mask, reverse=False, **kwargs):
288
- if not reverse:
289
- y = self.m + torch.exp(self.logs) * x
290
- y = y * x_mask
291
- logdet = torch.sum(self.logs * x_mask, [1,2])
292
- return y, logdet
293
- else:
294
- x = (x - self.m) * torch.exp(-self.logs) * x_mask
295
- return x
296
 
297
 
298
  class ResidualCouplingLayer(nn.Module):
299
- def __init__(self,
300
- channels,
301
- hidden_channels,
302
- kernel_size,
303
- dilation_rate,
304
- n_layers,
305
- p_dropout=0,
306
- gin_channels=0,
307
- mean_only=False):
308
- assert channels % 2 == 0, "channels should be divisible by 2"
309
- super().__init__()
310
- self.channels = channels
311
- self.hidden_channels = hidden_channels
312
- self.kernel_size = kernel_size
313
- self.dilation_rate = dilation_rate
314
- self.n_layers = n_layers
315
- self.half_channels = channels // 2
316
- self.mean_only = mean_only
317
-
318
- self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
319
- self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
320
- self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
321
- self.post.weight.data.zero_()
322
- self.post.bias.data.zero_()
323
-
324
- def forward(self, x, x_mask, g=None, reverse=False):
325
- x0, x1 = torch.split(x, [self.half_channels]*2, 1)
326
- h = self.pre(x0) * x_mask
327
- h = self.enc(h, x_mask, g=g)
328
- stats = self.post(h) * x_mask
329
- if not self.mean_only:
330
- m, logs = torch.split(stats, [self.half_channels]*2, 1)
331
- else:
332
- m = stats
333
- logs = torch.zeros_like(m)
334
-
335
- if not reverse:
336
- x1 = m + x1 * torch.exp(logs) * x_mask
337
- x = torch.cat([x0, x1], 1)
338
- logdet = torch.sum(logs, [1,2])
339
- return x, logdet
340
- else:
341
- x1 = (x1 - m) * torch.exp(-logs) * x_mask
342
- x = torch.cat([x0, x1], 1)
343
- return x
 
344
 
345
 
346
  class ConvFlow(nn.Module):
347
- def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
348
- super().__init__()
349
- self.in_channels = in_channels
350
- self.filter_channels = filter_channels
351
- self.kernel_size = kernel_size
352
- self.n_layers = n_layers
353
- self.num_bins = num_bins
354
- self.tail_bound = tail_bound
355
- self.half_channels = in_channels // 2
356
-
357
- self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
358
- self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
359
- self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
360
- self.proj.weight.data.zero_()
361
- self.proj.bias.data.zero_()
362
-
363
- def forward(self, x, x_mask, g=None, reverse=False):
364
- x0, x1 = torch.split(x, [self.half_channels]*2, 1)
365
- h = self.pre(x0)
366
- h = self.convs(h, x_mask, g=g)
367
- h = self.proj(h) * x_mask
368
-
369
- b, c, t = x0.shape
370
- h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
371
-
372
- unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
373
- unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
374
- unnormalized_derivatives = h[..., 2 * self.num_bins:]
375
-
376
- x1, logabsdet = piecewise_rational_quadratic_transform(x1,
377
- unnormalized_widths,
378
- unnormalized_heights,
379
- unnormalized_derivatives,
380
- inverse=reverse,
381
- tails='linear',
382
- tail_bound=self.tail_bound
383
- )
384
-
385
- x = torch.cat([x0, x1], 1) * x_mask
386
- logdet = torch.sum(logabsdet * x_mask, [1,2])
387
- if not reverse:
388
- return x, logdet
389
- else:
390
- return x
 
 
1
  import math
2
+
 
3
  import torch
4
  from torch import nn
5
+ from torch.nn import Conv1d
6
+ from torch.nn import functional as t_func
 
7
  from torch.nn.utils import weight_norm, remove_weight_norm
8
 
9
  import commons
10
  from commons import init_weights, get_padding
11
  from transforms import piecewise_rational_quadratic_transform
12
 
 
13
  LRELU_SLOPE = 0.1
14
 
15
 
16
  class LayerNorm(nn.Module):
17
+ def __init__(self, channels, eps=1e-5):
18
+ super().__init__()
19
+ self.channels = channels
20
+ self.eps = eps
21
+
22
+ self.gamma = nn.Parameter(torch.ones(channels))
23
+ self.beta = nn.Parameter(torch.zeros(channels))
24
 
25
+ def forward(self, x):
26
+ x = x.transpose(1, -1)
27
+ x = t_func.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
28
+ return x.transpose(1, -1)
29
 
 
 
 
 
30
 
 
31
  class ConvReluNorm(nn.Module):
32
+ def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
33
+ super().__init__()
34
+ self.in_channels = in_channels
35
+ self.hidden_channels = hidden_channels
36
+ self.out_channels = out_channels
37
+ self.kernel_size = kernel_size
38
+ self.n_layers = n_layers
39
+ self.p_dropout = p_dropout
40
+ assert n_layers > 1, "Number of layers should be larger than 1."
41
+
42
+ self.conv_layers = nn.ModuleList()
43
+ self.norm_layers = nn.ModuleList()
44
+ self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
45
+ self.norm_layers.append(LayerNorm(hidden_channels))
46
+ self.relu_drop = nn.Sequential(
47
+ nn.ReLU(),
48
+ nn.Dropout(p_dropout))
49
+ for _ in range(n_layers - 1):
50
+ self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
51
+ self.norm_layers.append(LayerNorm(hidden_channels))
52
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
53
+ self.proj.weight.data.zero_()
54
+ self.proj.bias.data.zero_()
55
+
56
+ def forward(self, x, x_mask):
57
+ x_org = x
58
+ for i in range(self.n_layers):
59
+ x = self.conv_layers[i](x * x_mask)
60
+ x = self.norm_layers[i](x)
61
+ x = self.relu_drop(x)
62
+ x = x_org + self.proj(x)
63
+ return x * x_mask
64
 
65
 
66
  class DDSConv(nn.Module):
67
+ """
68
+ Dialted and Depth-Separable Convolution
69
+ """
70
+
71
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
72
+ super().__init__()
73
+ self.channels = channels
74
+ self.kernel_size = kernel_size
75
+ self.n_layers = n_layers
76
+ self.p_dropout = p_dropout
77
+
78
+ self.drop = nn.Dropout(p_dropout)
79
+ self.convs_sep = nn.ModuleList()
80
+ self.convs_1x1 = nn.ModuleList()
81
+ self.norms_1 = nn.ModuleList()
82
+ self.norms_2 = nn.ModuleList()
83
+ for i in range(n_layers):
84
+ dilation = kernel_size ** i
85
+ padding = (kernel_size * dilation - dilation) // 2
86
+ self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
87
+ groups=channels, dilation=dilation, padding=padding
88
+ ))
89
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
90
+ self.norms_1.append(LayerNorm(channels))
91
+ self.norms_2.append(LayerNorm(channels))
92
+
93
+ def forward(self, x, x_mask, g=None):
94
+ if g is not None:
95
+ x = x + g
96
+ for i in range(self.n_layers):
97
+ y = self.convs_sep[i](x * x_mask)
98
+ y = self.norms_1[i](y)
99
+ y = t_func.gelu(y)
100
+ y = self.convs_1x1[i](y)
101
+ y = self.norms_2[i](y)
102
+ y = t_func.gelu(y)
103
+ y = self.drop(y)
104
+ x = x + y
105
+ return x * x_mask
106
 
107
 
108
  class WN(torch.nn.Module):
109
+ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
110
+ super(WN, self).__init__()
111
+ assert (kernel_size % 2 == 1)
112
+ self.hidden_channels = hidden_channels
113
+ self.kernel_size = kernel_size,
114
+ self.dilation_rate = dilation_rate
115
+ self.n_layers = n_layers
116
+ self.gin_channels = gin_channels
117
+ self.p_dropout = p_dropout
118
+
119
+ self.in_layers = torch.nn.ModuleList()
120
+ self.res_skip_layers = torch.nn.ModuleList()
121
+ self.drop = nn.Dropout(p_dropout)
122
+
123
+ if gin_channels != 0:
124
+ cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
125
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
126
+
127
+ for i in range(n_layers):
128
+ dilation = dilation_rate ** i
129
+ padding = int((kernel_size * dilation - dilation) / 2)
130
+ in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
131
+ dilation=dilation, padding=padding)
132
+ in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
133
+ self.in_layers.append(in_layer)
134
+
135
+ # last one is not necessary
136
+ if i < n_layers - 1:
137
+ res_skip_channels = 2 * hidden_channels
138
+ else:
139
+ res_skip_channels = hidden_channels
140
+
141
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
142
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
143
+ self.res_skip_layers.append(res_skip_layer)
144
+
145
+ def forward(self, x, x_mask, g=None, **kwargs):
146
+ output = torch.zeros_like(x)
147
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
148
+
149
+ if g is not None:
150
+ g = self.cond_layer(g)
151
+
152
+ for i in range(self.n_layers):
153
+ x_in = self.in_layers[i](x)
154
+ if g is not None:
155
+ cond_offset = i * 2 * self.hidden_channels
156
+ g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
157
+ else:
158
+ g_l = torch.zeros_like(x_in)
159
+
160
+ acts = commons.fused_add_tanh_sigmoid_multiply(
161
+ x_in,
162
+ g_l,
163
+ n_channels_tensor)
164
+ acts = self.drop(acts)
165
+
166
+ res_skip_acts = self.res_skip_layers[i](acts)
167
+ if i < self.n_layers - 1:
168
+ res_acts = res_skip_acts[:, :self.hidden_channels, :]
169
+ x = (x + res_acts) * x_mask
170
+ output = output + res_skip_acts[:, self.hidden_channels:, :]
171
+ else:
172
+ output = output + res_skip_acts
173
+ return output * x_mask
174
+
175
+ def remove_weight_norm(self):
176
+ if self.gin_channels != 0:
177
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
178
+ for l in self.in_layers:
179
+ torch.nn.utils.remove_weight_norm(l)
180
+ for l in self.res_skip_layers:
181
+ torch.nn.utils.remove_weight_norm(l)
182
 
183
 
184
  class ResBlock1(torch.nn.Module):
 
206
 
207
  def forward(self, x, x_mask=None):
208
  for c1, c2 in zip(self.convs1, self.convs2):
209
+ xt = t_func.leaky_relu(x, LRELU_SLOPE)
210
  if x_mask is not None:
211
  xt = xt * x_mask
212
  xt = c1(xt)
213
+ xt = t_func.leaky_relu(xt, LRELU_SLOPE)
214
  if x_mask is not None:
215
  xt = xt * x_mask
216
  xt = c2(xt)
 
239
 
240
  def forward(self, x, x_mask=None):
241
  for c in self.convs:
242
+ xt = t_func.leaky_relu(x, LRELU_SLOPE)
243
  if x_mask is not None:
244
  xt = xt * x_mask
245
  xt = c(xt)
 
254
 
255
 
256
  class Log(nn.Module):
257
+ def forward(self, x, x_mask, reverse=False, **kwargs):
258
+ if not reverse:
259
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
260
+ logdet = torch.sum(-y, [1, 2])
261
+ return y, logdet
262
+ else:
263
+ x = torch.exp(x) * x_mask
264
+ return x
265
+
266
 
267
  class Flip(nn.Module):
268
+ def forward(self, x, *args, reverse=False, **kwargs):
269
+ x = torch.flip(x, [1])
270
+ if not reverse:
271
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
272
+ return x, logdet
273
+ else:
274
+ return x
275
 
276
 
277
  class ElementwiseAffine(nn.Module):
278
+ def __init__(self, channels):
279
+ super().__init__()
280
+ self.channels = channels
281
+ self.m = nn.Parameter(torch.zeros(channels, 1))
282
+ self.logs = nn.Parameter(torch.zeros(channels, 1))
283
+
284
+ def forward(self, x, x_mask, reverse=False, **kwargs):
285
+ if not reverse:
286
+ y = self.m + torch.exp(self.logs) * x
287
+ y = y * x_mask
288
+ logdet = torch.sum(self.logs * x_mask, [1, 2])
289
+ return y, logdet
290
+ else:
291
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
292
+ return x
293
 
294
 
295
  class ResidualCouplingLayer(nn.Module):
296
+ def __init__(self,
297
+ channels,
298
+ hidden_channels,
299
+ kernel_size,
300
+ dilation_rate,
301
+ n_layers,
302
+ p_dropout=0,
303
+ gin_channels=0,
304
+ mean_only=False):
305
+ assert channels % 2 == 0, "channels should be divisible by 2"
306
+ super().__init__()
307
+ self.channels = channels
308
+ self.hidden_channels = hidden_channels
309
+ self.kernel_size = kernel_size
310
+ self.dilation_rate = dilation_rate
311
+ self.n_layers = n_layers
312
+ self.half_channels = channels // 2
313
+ self.mean_only = mean_only
314
+
315
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
316
+ self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
317
+ gin_channels=gin_channels)
318
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
319
+ self.post.weight.data.zero_()
320
+ self.post.bias.data.zero_()
321
+
322
+ def forward(self, x, x_mask, g=None, reverse=False):
323
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
324
+ h = self.pre(x0) * x_mask
325
+ h = self.enc(h, x_mask, g=g)
326
+ stats = self.post(h) * x_mask
327
+ if not self.mean_only:
328
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
329
+ else:
330
+ m = stats
331
+ logs = torch.zeros_like(m)
332
+
333
+ if not reverse:
334
+ x1 = m + x1 * torch.exp(logs) * x_mask
335
+ x = torch.cat([x0, x1], 1)
336
+ logdet = torch.sum(logs, [1, 2])
337
+ return x, logdet
338
+ else:
339
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
340
+ x = torch.cat([x0, x1], 1)
341
+ return x
342
 
343
 
344
  class ConvFlow(nn.Module):
345
+ def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
346
+ super().__init__()
347
+ self.in_channels = in_channels
348
+ self.filter_channels = filter_channels
349
+ self.kernel_size = kernel_size
350
+ self.n_layers = n_layers
351
+ self.num_bins = num_bins
352
+ self.tail_bound = tail_bound
353
+ self.half_channels = in_channels // 2
354
+
355
+ self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
356
+ self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
357
+ self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
358
+ self.proj.weight.data.zero_()
359
+ self.proj.bias.data.zero_()
360
+
361
+ def forward(self, x, x_mask, g=None, reverse=False):
362
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
363
+ h = self.pre(x0)
364
+ h = self.convs(h, x_mask, g=g)
365
+ h = self.proj(h) * x_mask
366
+
367
+ b, c, t = x0.shape
368
+ h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
369
+
370
+ unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
371
+ unnormalized_heights = h[..., self.num_bins:2 * self.num_bins] / math.sqrt(self.filter_channels)
372
+ unnormalized_derivatives = h[..., 2 * self.num_bins:]
373
+
374
+ x1, logabsdet = piecewise_rational_quadratic_transform(x1,
375
+ unnormalized_widths,
376
+ unnormalized_heights,
377
+ unnormalized_derivatives,
378
+ inverse=reverse,
379
+ tails='linear',
380
+ tail_bound=self.tail_bound
381
+ )
382
+
383
+ x = torch.cat([x0, x1], 1) * x_mask
384
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
385
+ if not reverse:
386
+ return x, logdet
387
+ else:
388
+ return x
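Functionally, the modules.py changes above are a reformat (plus renaming the functional alias from F to t_func); the WN block still leans on commons.fused_add_tanh_sigmoid_multiply. Assuming that helper implements the usual WaveNet-style gated activation (an assumption, since commons.py is not part of this diff), it behaves like this sketch:

import torch

def fused_add_tanh_sigmoid_multiply(a, b, n_channels):
    # assumed behaviour: add the conditioning, then gate the first half of the
    # channels with tanh and the second half with sigmoid
    n = int(n_channels[0])
    x = a + b
    return torch.tanh(x[:, :n, :]) * torch.sigmoid(x[:, n:, :])

hidden = 192
x_in = torch.randn(1, 2 * hidden, 50)                # output of an in_layer
g_l = torch.zeros_like(x_in)                         # zero conditioning when g is None
acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, torch.IntTensor([hidden]))
print(acts.shape)                                    # torch.Size([1, 192, 50])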
preprocess_wave.py ADDED
@@ -0,0 +1,118 @@
+ import os
+
+ import librosa
+ import numpy as np
+ import pyworld
+ from scipy.io import wavfile
+
+ import utils
+
+
+ class FeatureInput(object):
+     def __init__(self, samplerate=16000, hop_size=160):
+         self.fs = samplerate
+         self.hop = hop_size
+
+         self.f0_bin = 256
+         self.f0_max = 1100.0
+         self.f0_min = 50.0
+         self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
+         self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+
+     def compute_f0(self, path):
+         x, sr = librosa.load(path, sr=self.fs)
+         assert sr == self.fs
+         f0, t = pyworld.dio(
+             x.astype(np.double),
+             fs=sr,
+             f0_ceil=800,
+             frame_period=1000 * self.hop / sr,
+         )
+         f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
+         for index, pitch in enumerate(f0):
+             f0[index] = round(pitch, 1)
+         return f0
+
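compute_f0 runs WORLD's DIO estimator followed by StoneMask refinement, producing one f0 value per hop (160 samples at 16 kHz, about 10 ms), so the pitch track stays frame-aligned with the unit sequence. A standalone sketch of the same call chain (the wav path is a placeholder):

import librosa
import numpy as np
import pyworld

sr, hop = 16000, 160
x, _ = librosa.load("raw/source.wav", sr=sr)         # placeholder path
f0, t = pyworld.dio(x.astype(np.double), fs=sr,
                    f0_ceil=800, frame_period=1000 * hop / sr)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, sr)
print(len(f0), len(x) // hop + 1)                    # roughly one f0 value every 10 ms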
+     # for numpy  # code from diffsinger
+     def coarse_f0(self, f0):
+         f0_mel = 1127 * np.log(1 + f0 / 700)
+         f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
+             self.f0_bin - 2
+         ) / (self.f0_mel_max - self.f0_mel_min) + 1
+
+         # use 0 or 1
+         f0_mel[f0_mel <= 1] = 1
+         f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
+         f0_coarse = np.rint(f0_mel).astype(np.int)
+         assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
+             f0_coarse.max(),
+             f0_coarse.min(),
+         )
+         return f0_coarse
+
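coarse_f0 quantizes f0 onto a mel-spaced scale: unvoiced frames (f0 = 0) and f0_min both land on bin 1, while f0_max lands on bin 255, which is exactly the id range the 256-entry emb_pitch table in models.py expects. A compact numpy restatement with a few probe values (editor's sketch):

import numpy as np

f0_bin, f0_min, f0_max = 256, 50.0, 1100.0
mel_min = 1127 * np.log(1 + f0_min / 700)
mel_max = 1127 * np.log(1 + f0_max / 700)

def coarse(f0):
    f0 = np.asarray(f0, dtype=float)
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - mel_min) * (f0_bin - 2) / (mel_max - mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
    return np.rint(f0_mel).astype(int)   # plain int avoids the deprecated np.int alias

print(coarse([0.0, 50.0, 220.0, 440.0, 1100.0]))   # unvoiced -> 1, f0_max -> 255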
+     # for tensor  # code from diffsinger
+     def coarse_f0_ts(self, f0):
+         f0_mel = 1127 * (1 + f0 / 700).log()
+         f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
+             self.f0_bin - 2
+         ) / (self.f0_mel_max - self.f0_mel_min) + 1
+
+         # use 0 or 1
+         f0_mel[f0_mel <= 1] = 1
+         f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
+         f0_coarse = (f0_mel + 0.5).long()
+         assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
+             f0_coarse.max(),
+             f0_coarse.min(),
+         )
+         return f0_coarse
+
+     def save_wav(self, wav, path):
+         wav *= 32767 / max(0.01, np.max(np.abs(wav))) * 0.6
+         wavfile.write(path, self.fs, wav.astype(np.int16))
+
+
+ if __name__ == "__main__":
+     wavPath = "./data/waves"
+     outPath = "./data/label"
+     if not os.path.exists("./data/label"):
+         os.mkdir("./data/label")
+
+     # define model and load checkpoint
+     hps = utils.get_hparams_from_file("./configs/singing_base.json")
+     featureInput = FeatureInput(hps.data.sampling_rate, hps.data.hop_length)
+     vits_file = open("./filelists/vc_file.txt", "w", encoding="utf-8")
+
+     for spks in os.listdir(wavPath):
+         if os.path.isdir(f"./{wavPath}/{spks}"):
+             os.makedirs(f"./{outPath}/{spks}")
+             for file in os.listdir(f"./{wavPath}/{spks}"):
+                 if file.endswith(".wav"):
+                     file = file[:-4]
+                     audio_path = f"./{wavPath}/{spks}/{file}.wav"
+                     featur_pit = featureInput.compute_f0(audio_path)
+                     coarse_pit = featureInput.coarse_f0(featur_pit)
+                     np.save(
+                         f"{outPath}/{spks}/{file}_pitch.npy",
+                         coarse_pit,
+                         allow_pickle=False,
+                     )
+                     np.save(
+                         f"{outPath}/{spks}/{file}_nsff0.npy",
+                         featur_pit,
+                         allow_pickle=False,
+                     )
+
+                     path_audio = f"./data/waves/{spks}/{file}.wav"
+                     path_spkid = f"./data/spkid/{spks}.npy"
+                     path_label = (
+                         f"./data/phone/{spks}/{file}.npy"  # phone means ppg & hubert
+                     )
+                     path_pitch = f"./data/label/{spks}/{file}_pitch.npy"
+                     path_nsff0 = f"./data/label/{spks}/{file}_nsff0.npy"
+                     print(
+                         f"{path_audio}|{path_spkid}|{path_label}|{path_pitch}|{path_nsff0}",
+                         file=vits_file,
+                     )
+
+     vits_file.close()
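Each processed clip contributes one pipe-separated line to filelists/vc_file.txt: audio path, speaker-id file, phone/HuBERT feature file, coarse pitch file, and raw f0 file. A tiny sketch of reading such a line back (the concrete paths are made up for illustration):

line = ("./data/waves/spk0/001.wav|./data/spkid/spk0.npy|"
        "./data/phone/spk0/001.npy|./data/label/spk0/001_pitch.npy|"
        "./data/label/spk0/001_nsff0.npy")
path_audio, path_spkid, path_label, path_pitch, path_nsff0 = line.strip().split("|")
print(path_pitch)                                    # ./data/label/spk0/001_pitch.npy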
requirements.txt CHANGED
@@ -1,16 +1,16 @@
  Cython==0.29.21
  librosa==0.8.0
- matplotlib
- phonemizer
- scipy
  torch
  torchvision
- Unidecode
  torchaudio
  pyworld
  keras
  mir-eval
  pretty-midi
- tensorflow
- numpy
- pydub

  Cython==0.29.21
  librosa==0.8.0
+ matplotlib==3.3.1
+ numpy==1.18.5
+ phonemizer==2.2.1
+ scipy==1.5.2
  torch
  torchvision
+ Unidecode==1.1.1
  torchaudio
  pyworld
+ scipy
  keras
  mir-eval
  pretty-midi
+ pydub
 
 
text/LICENSE DELETED
@@ -1,19 +0,0 @@
1
- Copyright (c) 2017 Keith Ito
2
-
3
- Permission is hereby granted, free of charge, to any person obtaining a copy
4
- of this software and associated documentation files (the "Software"), to deal
5
- in the Software without restriction, including without limitation the rights
6
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
- copies of the Software, and to permit persons to whom the Software is
8
- furnished to do so, subject to the following conditions:
9
-
10
- The above copyright notice and this permission notice shall be included in
11
- all copies or substantial portions of the Software.
12
-
13
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
- THE SOFTWARE.
text/__init__.py DELETED
@@ -1,54 +0,0 @@
1
- """ from https://github.com/keithito/tacotron """
2
- from text import cleaners
3
- from text.symbols import symbols
4
-
5
-
6
- # Mappings from symbol to numeric ID and vice versa:
7
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8
- _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9
-
10
-
11
- def text_to_sequence(text, cleaner_names):
12
- '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
13
- Args:
14
- text: string to convert to a sequence
15
- cleaner_names: names of the cleaner functions to run the text through
16
- Returns:
17
- List of integers corresponding to the symbols in the text
18
- '''
19
- sequence = []
20
-
21
- clean_text = _clean_text(text, cleaner_names)
22
- for symbol in clean_text:
23
- symbol_id = _symbol_to_id[symbol]
24
- sequence += [symbol_id]
25
- return sequence
26
-
27
-
28
- def cleaned_text_to_sequence(cleaned_text):
29
- '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
30
- Args:
31
- text: string to convert to a sequence
32
- Returns:
33
- List of integers corresponding to the symbols in the text
34
- '''
35
- sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
36
- return sequence
37
-
38
-
39
- def sequence_to_text(sequence):
40
- '''Converts a sequence of IDs back to a string'''
41
- result = ''
42
- for symbol_id in sequence:
43
- s = _id_to_symbol[symbol_id]
44
- result += s
45
- return result
46
-
47
-
48
- def _clean_text(text, cleaner_names):
49
- for name in cleaner_names:
50
- cleaner = getattr(cleaners, name)
51
- if not cleaner:
52
- raise Exception('Unknown cleaner: %s' % name)
53
- text = cleaner(text)
54
- return text
@@ -1,100 +0,0 @@
1
- """ from https://github.com/keithito/tacotron """
2
-
3
- '''
4
- Cleaners are transformations that run over the input text at both training and eval time.
5
-
6
- Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7
- hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8
- 1. "english_cleaners" for English text
9
- 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10
- the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11
- 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12
- the symbols in symbols.py to match your data).
13
- '''
14
-
15
- import re
16
- from unidecode import unidecode
17
- from phonemizer import phonemize
18
-
19
-
20
- # Regular expression matching whitespace:
21
- _whitespace_re = re.compile(r'\s+')
22
-
23
- # List of (regular expression, replacement) pairs for abbreviations:
24
- _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
25
- ('mrs', 'misess'),
26
- ('mr', 'mister'),
27
- ('dr', 'doctor'),
28
- ('st', 'saint'),
29
- ('co', 'company'),
30
- ('jr', 'junior'),
31
- ('maj', 'major'),
32
- ('gen', 'general'),
33
- ('drs', 'doctors'),
34
- ('rev', 'reverend'),
35
- ('lt', 'lieutenant'),
36
- ('hon', 'honorable'),
37
- ('sgt', 'sergeant'),
38
- ('capt', 'captain'),
39
- ('esq', 'esquire'),
40
- ('ltd', 'limited'),
41
- ('col', 'colonel'),
42
- ('ft', 'fort'),
43
- ]]
44
-
45
-
46
- def expand_abbreviations(text):
47
- for regex, replacement in _abbreviations:
48
- text = re.sub(regex, replacement, text)
49
- return text
50
-
51
-
52
- def expand_numbers(text):
53
- return normalize_numbers(text)
54
-
55
-
56
- def lowercase(text):
57
- return text.lower()
58
-
59
-
60
- def collapse_whitespace(text):
61
- return re.sub(_whitespace_re, ' ', text)
62
-
63
-
64
- def convert_to_ascii(text):
65
- return unidecode(text)
66
-
67
-
68
- def basic_cleaners(text):
69
- '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
70
- text = lowercase(text)
71
- text = collapse_whitespace(text)
72
- return text
73
-
74
-
75
- def transliteration_cleaners(text):
76
- '''Pipeline for non-English text that transliterates to ASCII.'''
77
- text = convert_to_ascii(text)
78
- text = lowercase(text)
79
- text = collapse_whitespace(text)
80
- return text
81
-
82
-
83
- def english_cleaners(text):
84
- '''Pipeline for English text, including abbreviation expansion.'''
85
- text = convert_to_ascii(text)
86
- text = lowercase(text)
87
- text = expand_abbreviations(text)
88
- phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
89
- phonemes = collapse_whitespace(phonemes)
90
- return phonemes
91
-
92
-
93
- def english_cleaners2(text):
94
- '''Pipeline for English text, including abbreviation expansion. + punctuation + stress'''
95
- text = convert_to_ascii(text)
96
- text = lowercase(text)
97
- text = expand_abbreviations(text)
98
- phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True)
99
- phonemes = collapse_whitespace(phonemes)
100
- return phonemes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
text/symbols.py DELETED
@@ -1,16 +0,0 @@
1
- """ from https://github.com/keithito/tacotron """
2
-
3
- '''
4
- Defines the set of symbols used in text input to the model.
5
- '''
6
- _pad = '_'
7
- _punctuation = ';:,.!?¡¿—…"«»“” '
8
- _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
9
- _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
10
-
11
-
12
- # Export all symbols:
13
- symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
14
-
15
- # Special symbol ids
16
- SPACE_ID = symbols.index(" ")