This view is limited to 50 files because it contains too many changes.
- .gitattributes +0 -32
- LICENSE +0 -21
- README.md +4 -4
- app.py +60 -66
- attentions.py +294 -286
- commons.py +99 -100
- configs/yilanqiu.json → config.json +15 -10
- configs/nyarumul.json +0 -53
- configs/nyarusing.json +0 -52
- data.py +0 -36
- data_utils.py +12 -14
- hubert/__init__.py +0 -8
- hubert/__pycache__/__init__.cpython-38.pyc +0 -0
- hubert/__pycache__/model.cpython-38.pyc +0 -0
- hubert/dataset.py +0 -91
- hubert/utils.py +0 -58
- hubert/model.py → hubert_model.py +25 -91
- icassp2022_vocal_transcription/.gitignore +0 -3
- icassp2022_vocal_transcription/README.md +0 -56
- icassp2022_vocal_transcription/__init__.py +0 -3
- icassp2022_vocal_transcription/__pycache__/__init__.cpython-38.pyc +0 -0
- icassp2022_vocal_transcription/data/weight_ST.hdf5 +0 -3
- icassp2022_vocal_transcription/data/x_train_mean.npy +0 -3
- icassp2022_vocal_transcription/data/x_train_std.npy +0 -3
- icassp2022_vocal_transcription/img/ICASSP2022-fig1-2.png +0 -0
- icassp2022_vocal_transcription/img/example_pop1_midi.png +0 -0
- icassp2022_vocal_transcription/requirements.txt +0 -8
- icassp2022_vocal_transcription/src/MIDI.py +0 -141
- icassp2022_vocal_transcription/src/__init__.py +0 -0
- icassp2022_vocal_transcription/src/__pycache__/MIDI.cpython-38.pyc +0 -0
- icassp2022_vocal_transcription/src/__pycache__/__init__.cpython-38.pyc +0 -0
- icassp2022_vocal_transcription/src/__pycache__/featureExtraction.cpython-38.pyc +0 -0
- icassp2022_vocal_transcription/src/__pycache__/model.cpython-38.pyc +0 -0
- icassp2022_vocal_transcription/src/__pycache__/quantization.cpython-38.pyc +0 -0
- icassp2022_vocal_transcription/src/__pycache__/singing_transcription.cpython-38.pyc +0 -0
- icassp2022_vocal_transcription/src/__pycache__/utils.cpython-38.pyc +0 -0
- icassp2022_vocal_transcription/src/featureExtraction.py +0 -61
- icassp2022_vocal_transcription/src/model.py +0 -139
- icassp2022_vocal_transcription/src/quantization.py +0 -217
- icassp2022_vocal_transcription/src/singing_transcription.py +0 -147
- icassp2022_vocal_transcription/src/utils.py +0 -49
- infer_tool.py +132 -57
- models.py +9 -15
- modules.py +282 -284
- preprocess_wave.py +118 -0
- requirements.txt +7 -7
- text/LICENSE +0 -19
- text/__init__.py +0 -54
- text/cleaners.py +0 -100
- text/symbols.py +0 -16
.gitattributes
DELETED
@@ -1,32 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
-icassp2022_vocal_transcription/data/weight_ST.hdf5 filter=lfs diff=lfs merge=lfs -text
LICENSE
DELETED
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2021 Jaehyeon Kim
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
-title: Sovits
-emoji:
-colorFrom:
-colorTo:
+title: Sovits F0
+emoji: 🚀
+colorFrom: purple
+colorTo: gray
 sdk: gradio
 sdk_version: 3.4
 app_file: app.py
app.py
CHANGED
@@ -1,77 +1,45 @@
-import
+import time
 
 import gradio as gr
+import soundfile
 import torch
-import torchaudio
 
-import hubert
-import icassp2022_vocal_transcription
 import infer_tool
-import utils
-from models import SynthesizerTrn
 
-dev = torch.device("cpu")
-numba_logger = logging.getLogger('numba')
-numba_logger.setLevel(logging.WARNING)
 convert_cnt = [0]
+dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model_name = "152_epochs.pth"
+config_name = "nyarumul.json"
+net_g_ms, hubert_soft, feature_input, hps_ms = infer_tool.load_model(f"{model_name}", f"configs/{config_name}")
 
-                        n_speakers=hps_ms.data.n_speakers,
-                        **hps_ms.model)
-hubert_soft = hubert.hubert_soft('hubert.pt')
-_ = utils.load_checkpoint("1121_epochs.pth", net_g_ms, None)
-_ = net_g_ms.eval().to(dev)
+# 获取config参数
+target_sample = hps_ms.data.sampling_rate
+spk_dict = {
+    "奕兰秋": 4
+}
 
 
 def vc_fn(sid, audio_record, audio_upload, tran):
+    print(sid, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
     if audio_upload is not None:
         audio_path = audio_upload
     elif audio_record is not None:
         audio_path = audio_record
     else:
-        return "你需要上传wav
-    audio, sampling_rate = torchaudio.load(audio_path)
+        return "你需要上传wav文件或使用网页内置的录音!", None
+
+    audio, sampling_rate = infer_tool.format_wav(audio_path, target_sample)
     duration = audio.shape[0] / sampling_rate
-    if duration >
-        return "请上传小于
-    pitch = torch.LongTensor(pitch).unsqueeze(0).to(dev)
-    sid = torch.LongTensor([2]).to(dev) if sid == "" else torch.LongTensor([1]).to(dev)
-    stn_tst = torch.FloatTensor(soft)
-    with torch.no_grad():
-        x_tst = stn_tst.unsqueeze(0).to(dev)
-        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
-        audio = net_g_ms.infer(x_tst, x_tst_lengths, pitch=pitch, sid=sid, noise_scale=0.3,
-                               noise_scale_w=0.1, length_scale=1)[0][0, 0].data.float().cpu().numpy()
-    convert_cnt[0] += 1
-    print(convert_cnt[0])
-    return "Success", (hps_ms.data.sampling_rate, audio)
-
-character_dict = {
-    "夜刀神十香": 1,
-    "鸢一折纸": 2,
-    "时崎狂三": 3,
-    "冰芽川四糸乃": 4,
-    "五河琴里": 5,
-    "八舞夕弦": 6,
-    "八舞耶俱矢": 7,
-    "诱宵美九": 8,
-}
+    if duration > 60:
+        return "请上传小于60s的音频,需要转换长音频请使用colab", None
+
+    o_audio, out_sr = infer_tool.infer(audio_path, spk_dict[sid], tran, net_g_ms, hubert_soft, feature_input)
+    out_path = f"./out_temp.wav"
+    soundfile.write(out_path, o_audio, target_sample)
+    infer_tool.f0_plt(audio_path, out_path, tran, hubert_soft, feature_input)
+    mistake, var = infer_tool.calc_error(audio_path, out_path, tran, feature_input)
+    return f"分段误差参考:0.3优秀,0.5左右合理,少量0.8-1可以接受\n若偏差过大,请调整升降半音数;多次调整均过大、说明超出歌手音域\n半音偏差:{mistake}\n半音方差:{var}", (
+        target_sample, o_audio), gr.Image.update("temp.jpg")
 
 
 app = gr.Blocks()

@@ -79,26 +47,52 @@ with app:
     with gr.Tabs():
         with gr.TabItem("Basic"):
             gr.Markdown(value="""
-                如果想自己制作并训练模型可以访问这个 [github仓库](https://github.com/IceKyrin/sovits_guide)
+                本音源有授权,二创不创死主播即可。[其他音色体验](https://huggingface.co/spaces/innnky/nyaru-svc2.0-advanced)
+
+                本模型为sovits_f0,支持**60s以内**的**无伴奏**wav、mp3格式,或使用**网页内置**的录音(二选一)
+
+                **error就用格式工厂自行转换为wav再上传**
+
+                转换效果取决于源音频语气、节奏是否与目标音色相近。
+
+                源音频为女声时,**建议降3-6key**,**最后的输出误差越接近0,音准越高**
+
+                源音频为**低音男声**时,**建议升3key,具体看曲线图情况**
+
+                f0曲线可以直观的显示跑调情况,蓝色为输入音高,橙色为合成音频的音高
+
+                若**只看见橙色**,说明蓝色曲线被覆盖,转换效果较好
                 """)
-            speaker_id = gr.Dropdown(label="音色", choices=
+            speaker_id = gr.Dropdown(label="音色", choices=["奕兰秋"], value="奕兰秋")
             record_input = gr.Audio(source="microphone", label="录制你的声音", type="filepath", elem_id="audio_inputs")
             upload_input = gr.Audio(source="upload", label="上传音频(长度小于45秒)", type="filepath",
                                     elem_id="audio_inputs")
-            vc_transform = gr.Number(label="
+            vc_transform = gr.Number(label="升降半音(整数,可以正负,半音数量,升高八度就是12)", value=0)
             vc_submit = gr.Button("转换", variant="primary")
             out_message = gr.Textbox(label="Output Message")
             out_audio = gr.Audio(label="Output Audio")
+            f0_image = gr.Image(label="f0曲线")
+            vc_submit.click(vc_fn, [speaker_id, record_input, upload_input, vc_transform],
+                            [out_message, out_audio, f0_image])
+        with gr.TabItem("使用说明"):
+            gr.Markdown(value="""
+                0、合集:https://github.com/IceKyrin/sovits_guide/blob/main/README.md
+
+                1、仅支持sovit_f0(sovits2.0)模型
+
+                2、自行下载hubert-soft-0d54a1f4.pt改名为hubert.pt(已经下好了)
+                https://github.com/bshall/hubert/releases/tag/v0.1
+
+                3、pth文件夹下放置sovits2.0的模型
+
+                4、与模型配套的xxx.json,需有speaker项——人物列表
+
+                5、放无伴奏的音频、或网页内置录音,不要放奇奇怪怪的格式
+
+                6、仅供交流使用,不对用户行为负责
+
+                7、268000为44100预模型,配合sovits_pre.json;50000为22050预模型,配合nyarumul.json
+                """)
 app.launch()
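The rewritten app.py above drives the whole conversion through infer_tool. For orientation, here is a minimal offline sketch of the same pipeline, assuming only the infer_tool functions and signatures that appear in the diff (load_model, format_wav, infer, f0_plt, calc_error); the file names, speaker id and transpose value are illustrative placeholders, not part of the commit.

```python
# Minimal offline sketch of the pipeline the Gradio app wires up (assumed usage).
import soundfile
import infer_tool

model_name = "152_epochs.pth"   # hypothetical checkpoint in the repo root
config_name = "nyarumul.json"
net_g_ms, hubert_soft, feature_input, hps_ms = infer_tool.load_model(
    f"{model_name}", f"configs/{config_name}")
target_sample = hps_ms.data.sampling_rate

audio_path = "input.wav"        # unaccompanied vocal, under 60 s
tran = -3                       # transpose in semitones (the app suggests -3..-6 for female sources)

audio, sampling_rate = infer_tool.format_wav(audio_path, target_sample)
o_audio, out_sr = infer_tool.infer(audio_path, 4, tran, net_g_ms, hubert_soft, feature_input)
soundfile.write("out_temp.wav", o_audio, target_sample)

# Optional diagnostics, as in the app: f0 curve plot and semitone error.
infer_tool.f0_plt(audio_path, "out_temp.wav", tran, hubert_soft, feature_input)
mistake, var = infer_tool.calc_error(audio_path, "out_temp.wav", tran, feature_input)
print(f"半音偏差: {mistake}, 半音方差: {var}")
```

The speaker id 4 mirrors the "奕兰秋" entry in spk_dict, and tran plays the role of the app's 升降半音 control.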
attentions.py
CHANGED
@@ -1,303 +1,311 @@
-import copy
 import math
 
 import torch
 from torch import nn
-from torch.nn import functional as
+from torch.nn import functional as t_func
 
 import commons
-import modules
 from modules import LayerNorm
 
 
 class Encoder(nn.Module):
+    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4,
+                 **kwargs):
+        super().__init__()
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+
+        self.drop = nn.Dropout(p_dropout)
+        self.attn_layers = nn.ModuleList()
+        self.norm_layers_1 = nn.ModuleList()
+        self.ffn_layers = nn.ModuleList()
+        self.norm_layers_2 = nn.ModuleList()
+        for i in range(self.n_layers):
+            self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
+                                                       window_size=window_size))
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+            self.ffn_layers.append(
+                FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+    def forward(self, x, x_mask):
+        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        x = x * x_mask
+        for i in range(self.n_layers):
+            y = self.attn_layers[i](x, x, attn_mask)
+            y = self.drop(y)
+            x = self.norm_layers_1[i](x + y)
+
+            y = self.ffn_layers[i](x, x_mask)
+            y = self.drop(y)
+            x = self.norm_layers_2[i](x + y)
+        x = x * x_mask
+        return x
 
 
 class Decoder(nn.Module):
+    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.,
+                 proximal_bias=False, proximal_init=True, **kwargs):
+        super().__init__()
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.proximal_bias = proximal_bias
+        self.proximal_init = proximal_init
+
+        self.drop = nn.Dropout(p_dropout)
+        self.self_attn_layers = nn.ModuleList()
+        self.norm_layers_0 = nn.ModuleList()
+        self.encdec_attn_layers = nn.ModuleList()
+        self.norm_layers_1 = nn.ModuleList()
+        self.ffn_layers = nn.ModuleList()
+        self.norm_layers_2 = nn.ModuleList()
+        for i in range(self.n_layers):
+            self.self_attn_layers.append(
+                MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
+                                   proximal_bias=proximal_bias, proximal_init=proximal_init))
+            self.norm_layers_0.append(LayerNorm(hidden_channels))
+            self.encdec_attn_layers.append(
+                MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+            self.ffn_layers.append(
+                FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+    def forward(self, x, x_mask, h, h_mask):
+        """
+        x: decoder input
+        h: encoder output
+        """
+        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
+        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        x = x * x_mask
+        for i in range(self.n_layers):
+            y = self.self_attn_layers[i](x, x, self_attn_mask)
+            y = self.drop(y)
+            x = self.norm_layers_0[i](x + y)
+
+            y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
+            y = self.drop(y)
+            x = self.norm_layers_1[i](x + y)
+
+            y = self.ffn_layers[i](x, x_mask)
+            y = self.drop(y)
+            x = self.norm_layers_2[i](x + y)
+        x = x * x_mask
+        return x
 
 
 class MultiHeadAttention(nn.Module):
+    def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True,
+                 block_length=None, proximal_bias=False, proximal_init=False):
+        super().__init__()
+        assert channels % n_heads == 0
+
+        self.channels = channels
+        self.out_channels = out_channels
+        self.n_heads = n_heads
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+        self.heads_share = heads_share
+        self.block_length = block_length
+        self.proximal_bias = proximal_bias
+        self.proximal_init = proximal_init
+        self.attn = None
+
+        self.k_channels = channels // n_heads
+        self.conv_q = nn.Conv1d(channels, channels, 1)
+        self.conv_k = nn.Conv1d(channels, channels, 1)
+        self.conv_v = nn.Conv1d(channels, channels, 1)
+        self.conv_o = nn.Conv1d(channels, out_channels, 1)
+        self.drop = nn.Dropout(p_dropout)
+
+        if window_size is not None:
+            n_heads_rel = 1 if heads_share else n_heads
+            rel_stddev = self.k_channels ** -0.5
+            self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
+            self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
+
+        nn.init.xavier_uniform_(self.conv_q.weight)
+        nn.init.xavier_uniform_(self.conv_k.weight)
+        nn.init.xavier_uniform_(self.conv_v.weight)
+        if proximal_init:
+            with torch.no_grad():
+                self.conv_k.weight.copy_(self.conv_q.weight)
+                self.conv_k.bias.copy_(self.conv_q.bias)
+
+    def forward(self, x, c, attn_mask=None):
+        q = self.conv_q(x)
+        k = self.conv_k(c)
+        v = self.conv_v(c)
+
+        x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+        x = self.conv_o(x)
+        return x
+
+    def attention(self, query, key, value, mask=None):
+        # reshape [b, d, t] -> [b, n_h, t, d_k]
+        b, d, t_s, t_t = (*key.size(), query.size(2))
+        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+
+        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+        if self.window_size is not None:
+            assert t_s == t_t, "Relative attention is only available for self-attention."
+            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+            rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
+            scores_local = self._relative_position_to_absolute_position(rel_logits)
+            scores = scores + scores_local
+        if self.proximal_bias:
+            assert t_s == t_t, "Proximal bias is only available for self-attention."
+            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
+        if mask is not None:
+            scores = scores.masked_fill(mask == 0, -1e4)
+            if self.block_length is not None:
+                assert t_s == t_t, "Local attention is only available for self-attention."
+                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
+                scores = scores.masked_fill(block_mask == 0, -1e4)
+        p_attn = t_func.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
+        p_attn = self.drop(p_attn)
+        output = torch.matmul(p_attn, value)
+        if self.window_size is not None:
+            relative_weights = self._absolute_position_to_relative_position(p_attn)
+            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
+            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
+        output = output.transpose(2, 3).contiguous().view(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
+        return output, p_attn
+
+    def _matmul_with_relative_values(self, x, y):
+        """
+        x: [b, h, l, m]
+        y: [h or 1, m, d]
+        ret: [b, h, l, d]
+        """
+        ret = torch.matmul(x, y.unsqueeze(0))
+        return ret
+
+    def _matmul_with_relative_keys(self, x, y):
+        """
+        x: [b, h, l, d]
+        y: [h or 1, m, d]
+        ret: [b, h, l, m]
+        """
+        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+        return ret
+
+    def _get_relative_embeddings(self, relative_embeddings, length):
+        max_relative_position = 2 * self.window_size + 1
+        # Pad first before slice to avoid using cond ops.
+        pad_length = max(length - (self.window_size + 1), 0)
+        slice_start_position = max((self.window_size + 1) - length, 0)
+        slice_end_position = slice_start_position + 2 * length - 1
+        if pad_length > 0:
+            padded_relative_embeddings = t_func.pad(
+                relative_embeddings,
+                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
+        else:
+            padded_relative_embeddings = relative_embeddings
+        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
+        return used_relative_embeddings
+
+    def _relative_position_to_absolute_position(self, x):
+        """
+        x: [b, h, l, 2*l-1]
+        ret: [b, h, l, l]
+        """
+        batch, heads, length, _ = x.size()
+        # Concat columns of pad to shift from relative to absolute indexing.
+        x = t_func.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+
+        # Concat extra elements so to add up to shape (len+1, 2*len-1).
+        x_flat = x.view([batch, heads, length * 2 * length])
+        x_flat = t_func.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
+
+        # Reshape and slice out the padded elements.
+        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
+        return x_final
+
+    def _absolute_position_to_relative_position(self, x):
+        """
+        x: [b, h, l, l]
+        ret: [b, h, l, 2*l-1]
+        """
+        batch, heads, length, _ = x.size()
+        # padd along column
+        x = t_func.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
+        x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
+        # add 0's in the beginning that will skew the elements after reshape
+        x_flat = t_func.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+        return x_final
+
+    def _attention_bias_proximal(self, length):
+        """Bias for self-attention to encourage attention to close positions.
+        Args:
+          length: an integer scalar.
+        Returns:
+          a Tensor with shape [1, 1, length, length]
+        """
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
 
 
 class FFN(nn.Module):
+    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None,
+                 causal=False):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.activation = activation
+        self.causal = causal
+
+        if causal:
+            self.padding = self._causal_padding
+        else:
+            self.padding = self._same_padding
+
+        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+        self.drop = nn.Dropout(p_dropout)
+
+    def forward(self, x, x_mask):
+        x = self.conv_1(self.padding(x * x_mask))
+        if self.activation == "gelu":
+            x = x * torch.sigmoid(1.702 * x)
+        else:
+            x = torch.relu(x)
+        x = self.drop(x)
+        x = self.conv_2(self.padding(x * x_mask))
+        return x * x_mask
+
+    def _causal_padding(self, x):
+        if self.kernel_size == 1:
+            return x
+        pad_l = self.kernel_size - 1
+        pad_r = 0
+        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+        x = t_func.pad(x, commons.convert_pad_shape(padding))
+        return x
+
+    def _same_padding(self, x):
+        if self.kernel_size == 1:
+            return x
+        pad_l = (self.kernel_size - 1) // 2
+        pad_r = self.kernel_size // 2
+        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+        x = t_func.pad(x, commons.convert_pad_shape(padding))
+        return x
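The rewritten attentions.py is a complete transformer stack (Encoder, Decoder, MultiHeadAttention with relative-position embeddings, FFN). A small smoke-test sketch, assuming this repo's attentions.py together with its commons.py and modules.py dependencies is importable, and borrowing hyperparameters from the model section of the config in this diff:

```python
import torch

from attentions import Encoder  # assumes this repo's attentions.py is on the path

# Hyperparameters taken from the "model" section of config.json in this diff.
enc = Encoder(hidden_channels=256, filter_channels=768, n_heads=2,
              n_layers=6, kernel_size=3, p_dropout=0.1)
enc.eval()

x = torch.randn(1, 256, 50)    # [batch, hidden_channels, frames]
x_mask = torch.ones(1, 1, 50)  # all 50 frames are valid
with torch.no_grad():
    y = enc(x, x_mask)
print(y.shape)  # torch.Size([1, 256, 50]); the encoder preserves the input shape
```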
commons.py
CHANGED
@@ -1,161 +1,160 @@
 import math
 
 import torch
-from torch import
-from torch.nn import functional as F
+from torch.nn import functional as t_func
 
 
 def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
 
 
 def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)
 
 
 def convert_pad_shape(pad_shape):
+    l = pad_shape[::-1]
+    pad_shape = [item for sublist in l for item in sublist]
+    return pad_shape
 
 
 def intersperse(lst, item):
+    result = [item] * (len(lst) * 2 + 1)
+    result[1::2] = lst
+    return result
 
 
 def kl_divergence(m_p, logs_p, m_q, logs_q):
+    """KL(P||Q)"""
+    kl = (logs_q - logs_p) - 0.5
+    kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2. * logs_q)
+    return kl
 
 
 def rand_gumbel(shape):
+    """Sample from the Gumbel distribution, protect from overflows."""
+    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
+    return -torch.log(-torch.log(uniform_samples))
 
 
 def rand_gumbel_like(x):
+    g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
+    return g
 
 
 def slice_segments(x, ids_str, segment_size=4):
+    ret = torch.zeros_like(x[:, :, :segment_size])
+    for i in range(x.size(0)):
+        idx_str = ids_str[i]
+        idx_end = idx_str + segment_size
+        ret[i] = x[i, :, idx_str:idx_end]
+    return ret
 
 
 def rand_slice_segments(x, x_lengths=None, segment_size=4):
+    b, d, t = x.size()
+    if x_lengths is None:
+        x_lengths = t
+    ids_str_max = x_lengths - segment_size + 1
+    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+    ret = slice_segments(x, ids_str, segment_size)
+    return ret, ids_str
 
 
 def get_timing_signal_1d(
+        length, channels, min_timescale=1.0, max_timescale=1.0e4):
+    position = torch.arange(length, dtype=torch.float)
+    num_timescales = channels // 2
+    log_timescale_increment = (
+            math.log(float(max_timescale) / float(min_timescale)) /
+            (num_timescales - 1))
+    inv_timescales = min_timescale * torch.exp(
+        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
+    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+    signal = t_func.pad(signal, [0, 0, 0, channels % 2])
+    signal = signal.view(1, channels, length)
+    return signal
 
 
 def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+    b, channels, length = x.size()
+    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+    return x + signal.to(dtype=x.dtype, device=x.device)
 
 
 def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
+    b, channels, length = x.size()
+    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
 
 
 def subsequent_mask(length):
+    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
+    return mask
 
 
 @torch.jit.script
 def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts
 
 
 def convert_pad_shape(pad_shape):
+    l = pad_shape[::-1]
+    pad_shape = [item for sublist in l for item in sublist]
+    return pad_shape
 
 
 def shift_1d(x):
+    x = t_func.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
+    return x
 
 
 def sequence_mask(length, max_length=None):
+    if max_length is None:
+        max_length = length.max()
+    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+    return x.unsqueeze(0) < length.unsqueeze(1)
 
 
 def generate_path(duration, mask):
+    """
+    duration: [b, 1, t_x]
+    mask: [b, 1, t_y, t_x]
+    """
+    device = duration.device
+
+    b, _, t_y, t_x = mask.shape
+    cum_duration = torch.cumsum(duration, -1)
+
+    cum_duration_flat = cum_duration.view(b * t_x)
+    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+    path = path.view(b, t_x, t_y)
+    path = path - t_func.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+    path = path.unsqueeze(1).transpose(2, 3) * mask
+    return path
 
 
 def clip_grad_value_(parameters, clip_value, norm_type=2):
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    parameters = list(filter(lambda para: para.grad is not None, parameters))
+    norm_type = float(norm_type)
     if clip_value is not None:
+        clip_value = float(clip_value)
+
+    total_norm = 0
+    for p in parameters:
+        param_norm = p.grad.data.norm(norm_type)
+        total_norm += param_norm.item() ** norm_type
+        if clip_value is not None:
+            p.grad.data.clamp_(min=-clip_value, max=clip_value)
+    total_norm = total_norm ** (1. / norm_type)
+    return total_norm
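Several of the reworked functions above lean on convert_pad_shape, which simply reverses the per-dimension [left, right] pairs and flattens them into the order torch.nn.functional.pad expects (last dimension first). A tiny sketch, assuming this repo's commons.py is importable:

```python
import torch
from torch.nn import functional as t_func

from commons import convert_pad_shape  # this repo's commons.py

pad = convert_pad_shape([[0, 0], [0, 0], [1, 0]])
print(pad)  # [1, 0, 0, 0, 0, 0] -> pad the last dimension by one element on the left

x = torch.arange(6.).view(1, 2, 3)
shifted = t_func.pad(x, pad)[:, :, :-1]  # the same trick shift_1d uses
print(shifted)  # each row shifted right by one frame, zero-filled at the front
```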
configs/yilanqiu.json → config.json
RENAMED
@@ -1,7 +1,7 @@
 {
   "train": {
     "log_interval": 200,
-    "eval_interval":
+    "eval_interval": 5000,
     "seed": 1234,
     "epochs": 10000,
     "learning_rate": 2e-4,
@@ -10,7 +10,7 @@
       0.99
     ],
     "eps": 1e-9,
-    "batch_size":
+    "batch_size": 32,
     "fp16_run": true,
     "lr_decay": 0.999875,
     "segment_size": 8192,
@@ -20,8 +20,8 @@
     "c_kl": 1.0
   },
   "data": {
-    "training_files": "/
-    "validation_files": "/
+    "training_files": "./filelist/train.txt",
+    "validation_files": "./filelist/val.txt",
     "text_cleaners": [
       "english_cleaners2"
     ],
@@ -34,10 +34,10 @@
     "mel_fmin": 0.0,
     "mel_fmax": null,
     "add_blank": true,
-    "n_speakers":
-    "cleaned_text": true
+    "n_speakers": 8
   },
   "model": {
+    "sampling_rate": 22050,
     "inter_channels": 192,
     "hidden_channels": 256,
     "filter_channels": 768,
@@ -86,8 +86,13 @@
     "gin_channels": 256
   },
   "speakers": [
-    "
-    "
-    "
+    "zhezhi",
+    "kuangsan",
+    "sisinai",
+    "qinli",
+    "xixian",
+    "yejushi",
+    "meijiu",
+    "shixiang"
   ]
-}
+}
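The renamed config.json is what the Space reads at startup. As a small sketch of pulling out the fields the app relies on, assuming the file sits at the repository root as the rename suggests and using plain json rather than the repo's utils helper:

```python
import json

with open("config.json", encoding="utf-8") as f:
    cfg = json.load(f)

# Fields the app uses through hps_ms: the data sampling rate and the speaker list.
target_sample = cfg["data"]["sampling_rate"]  # 22050 in the sibling configs shown in this diff
speakers = cfg["speakers"]                    # ["zhezhi", "kuangsan", ..., "shixiang"]
print(target_sample, speakers)
```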
configs/nyarumul.json
DELETED
@@ -1,53 +0,0 @@
-{
-  "train": {
-    "log_interval": 200,
-    "eval_interval": 2000,
-    "seed": 1234,
-    "epochs": 10000,
-    "learning_rate": 2e-4,
-    "betas": [0.8, 0.99],
-    "eps": 1e-9,
-    "batch_size": 16,
-    "fp16_run": true,
-    "lr_decay": 0.999875,
-    "segment_size": 8192,
-    "init_lr_ratio": 1,
-    "warmup_epochs": 0,
-    "c_mel": 45,
-    "c_kl": 1.0
-  },
-  "data": {
-    "training_files":"/content/drive/MyDrive/SingingVC/trainmul.txt",
-    "validation_files":"/content/drive/MyDrive/SingingVC/valmul.txt",
-    "text_cleaners":["english_cleaners2"],
-    "max_wav_value": 32768.0,
-    "sampling_rate": 22050,
-    "filter_length": 1024,
-    "hop_length": 256,
-    "win_length": 1024,
-    "n_mel_channels": 80,
-    "mel_fmin": 0.0,
-    "mel_fmax": null,
-    "add_blank": true,
-    "n_speakers": 3,
-    "cleaned_text": true
-  },
-  "model": {
-    "inter_channels": 192,
-    "hidden_channels": 256,
-    "filter_channels": 768,
-    "n_heads": 2,
-    "n_layers": 6,
-    "kernel_size": 3,
-    "p_dropout": 0.1,
-    "resblock": "1",
-    "resblock_kernel_sizes": [3,7,11],
-    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
-    "upsample_rates": [8,8,2,2],
-    "upsample_initial_channel": 512,
-    "upsample_kernel_sizes": [16,16,4,4],
-    "n_layers_q": 3,
-    "use_spectral_norm": false,
-    "gin_channels": 256
-  }
-}
configs/nyarusing.json
DELETED
@@ -1,52 +0,0 @@
-{
-  "train": {
-    "log_interval": 200,
-    "eval_interval": 2000,
-    "seed": 1234,
-    "epochs": 20000,
-    "learning_rate": 2e-4,
-    "betas": [0.8, 0.99],
-    "eps": 1e-9,
-    "batch_size": 24,
-    "fp16_run": true,
-    "lr_decay": 0.999875,
-    "segment_size": 8192,
-    "init_lr_ratio": 1,
-    "warmup_epochs": 0,
-    "c_mel": 45,
-    "c_kl": 1.0
-  },
-  "data": {
-    "training_files":"/content/train.txt",
-    "validation_files":"/content/nyarusing/val.txt",
-    "text_cleaners":["english_cleaners2"],
-    "max_wav_value": 32768.0,
-    "sampling_rate": 22050,
-    "filter_length": 1024,
-    "hop_length": 256,
-    "win_length": 1024,
-    "n_mel_channels": 80,
-    "mel_fmin": 0.0,
-    "mel_fmax": null,
-    "add_blank": true,
-    "n_speakers": 0,
-    "cleaned_text": true
-  },
-  "model": {
-    "inter_channels": 192,
-    "hidden_channels": 256,
-    "filter_channels": 768,
-    "n_heads": 2,
-    "n_layers": 6,
-    "kernel_size": 3,
-    "p_dropout": 0.1,
-    "resblock": "1",
-    "resblock_kernel_sizes": [3,7,11],
-    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
-    "upsample_rates": [8,8,2,2],
-    "upsample_initial_channel": 512,
-    "upsample_kernel_sizes": [16,16,4,4],
-    "n_layers_q": 3,
-    "use_spectral_norm": false
-  }
-}
data.py
DELETED
@@ -1,36 +0,0 @@
-import os
-import numpy as np
-import icassp2022_vocal_transcription
-
-
-def resize2d(source, target_len):
-    source = source.astype(float)
-    source[source < 0.001] = np.nan
-    target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
-                       source)
-    res = np.nan_to_num(target)
-    ret = res[:].astype(int)
-    # 若调整大小时采样到中间的点,则以上一个点作为当前音高值
-    for i in range(len(res)):
-        if res[i] - ret[i] > 0.001:
-            ret[i] = ret[i - 1]
-    return ret
-
-
-def get_end_file(dir_path, end):
-    file_lists = []
-    for root, dirs, files in os.walk(dir_path):
-        for f_file in files:
-            if f_file.endswith(end):
-                file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
-
-    return file_lists
-
-
-folder = "val"
-wav_paths = get_end_file(f"./qiu/wavs/{folder}/", "wav")
-for wav_path in wav_paths:
-    pitch = icassp2022_vocal_transcription.transcribe(wav_path)
-    soft = np.load(wav_path.replace("wavs", "soft").replace(".wav", ".npy"))
-    pitch = resize2d(pitch, len(soft[:, 0]))
-    np.save(wav_path.replace("wavs", "pitch").replace(".wav", ".npy"), pitch)
data_utils.py
CHANGED
@@ -1,14 +1,12 @@
-import time
 import os
 import random
+
 import numpy as np
 import torch
 import torch.utils.data
-import numpy as np
-import commons
 from mel_processing import spectrogram_torch
+
 from utils import load_wav_to_torch, load_filepaths_and_text
-from text import text_to_sequence, cleaned_text_to_sequence
 
 
 def dropout1d(myarray, ratio=0.5):

@@ -59,11 +57,11 @@ class TextAudioLoader(torch.utils.data.Dataset):
 
     def get_audio_text_pair(self, audiopath_and_text):
         # separate filename and text
-        audiopath, text, pitch = audiopath_and_text[0], audiopath_and_text[1],audiopath_and_text[2]
+        audiopath, text, pitch = audiopath_and_text[0], audiopath_and_text[1], audiopath_and_text[2]
         text = self.get_text(text)
         spec, wav = self.get_audio(audiopath)
         pitch = self.get_pitch(pitch)
-        return
+        return text, spec, wav, pitch
 
     def get_pitch(self, pitch):

@@ -99,7 +97,7 @@ class TextAudioLoader(torch.utils.data.Dataset):
         return len(self.audiopaths_and_text)
 
 
-class TextAudioCollate
+class TextAudioCollate:
     """ Zero-pads model inputs and targets
     """

@@ -123,7 +121,6 @@ class TextAudioCollate():
         max_pitch_len = max([x[3].shape[0] for x in batch])
         # print(batch)
 
-
         text_lengths = torch.LongTensor(len(batch))
         spec_lengths = torch.LongTensor(len(batch))
         wav_lengths = torch.LongTensor(len(batch))

@@ -205,13 +202,14 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
 
     def get_audio_text_speaker_pair(self, audiopath_sid_text):
         # separate filename, speaker_id and text
-        audiopath, sid, text, pitch = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2],
+        audiopath, sid, text, pitch = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2], \
+            audiopath_sid_text[3]
         text = self.get_text(text)
         spec, wav = self.get_audio(audiopath)
         sid = self.get_sid(sid)
         pitch = self.get_pitch(pitch)
 
-        return
+        return text, spec, wav, pitch, sid
 
     def get_audio(self, filename):
         audio, sampling_rate = load_wav_to_torch(filename)

@@ -235,7 +233,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         soft = np.load(text)
         text_norm = torch.FloatTensor(soft)
         return text_norm
+
     def get_pitch(self, pitch):
         return torch.LongTensor(np.load(pitch))

@@ -250,7 +248,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         return len(self.audiopaths_sid_text)
 
 
-class TextAudioSpeakerCollate
+class TextAudioSpeakerCollate:
    """ Zero-pads model inputs and targets
    """

@@ -310,7 +308,7 @@ class TextAudioSpeakerCollate():
 
         if self.return_ids:
             return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid, ids_sorted_decreasing
-        return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths,pitch_padded
+        return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid
 
 
 class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):

@@ -400,7 +398,7 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
 
         if hi > lo:
             mid = (hi + lo) // 2
-            if self.boundaries[mid] < x
+            if self.boundaries[mid] < x <= self.boundaries[mid + 1]:
                 return mid
             elif x <= self.boundaries[mid]:
                 return self._bisect(x, lo, mid)
hubert/__init__.py
DELETED
@@ -1,8 +0,0 @@
-from .model import (
-    Hubert,
-    HubertDiscrete,
-    HubertSoft,
-    hubert_discrete,
-    hubert_soft,
-    kmeans100,
-)
hubert/__pycache__/__init__.cpython-38.pyc
DELETED
Binary file (281 Bytes)
hubert/__pycache__/model.cpython-38.pyc
DELETED
Binary file (10 kB)
hubert/dataset.py
DELETED
|
@@ -1,91 +0,0 @@
|
|
| 1 |
-
import random
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
import numpy as np
|
| 4 |
-
import json
|
| 5 |
-
|
| 6 |
-
import torch
|
| 7 |
-
import torch.nn.functional as F
|
| 8 |
-
from torch.utils.data import Dataset
|
| 9 |
-
import torchaudio
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
class AcousticUnitsDataset(Dataset):
|
| 13 |
-
def __init__(
|
| 14 |
-
self,
|
| 15 |
-
root: Path,
|
| 16 |
-
sample_rate: int = 16000,
|
| 17 |
-
label_rate: int = 50,
|
| 18 |
-
min_samples: int = 32000,
|
| 19 |
-
max_samples: int = 250000,
|
| 20 |
-
train: bool = True,
|
| 21 |
-
):
|
| 22 |
-
self.wavs_dir = root / "wavs"
|
| 23 |
-
self.units_dir = root / "units"
|
| 24 |
-
|
| 25 |
-
with open(root / "lengths.json") as file:
|
| 26 |
-
self.lenghts = json.load(file)
|
| 27 |
-
|
| 28 |
-
pattern = "train-*/**/*.flac" if train else "dev-*/**/*.flac"
|
| 29 |
-
metadata = (
|
| 30 |
-
(path, path.relative_to(self.wavs_dir).with_suffix("").as_posix())
|
| 31 |
-
for path in self.wavs_dir.rglob(pattern)
|
| 32 |
-
)
|
| 33 |
-
metadata = ((path, key) for path, key in metadata if key in self.lenghts)
|
| 34 |
-
self.metadata = [
|
| 35 |
-
path for path, key in metadata if self.lenghts[key] > min_samples
|
| 36 |
-
]
|
| 37 |
-
|
| 38 |
-
self.sample_rate = sample_rate
|
| 39 |
-
self.label_rate = label_rate
|
| 40 |
-
self.min_samples = min_samples
|
| 41 |
-
self.max_samples = max_samples
|
| 42 |
-
self.train = train
|
| 43 |
-
|
| 44 |
-
def __len__(self):
|
| 45 |
-
return len(self.metadata)
|
| 46 |
-
|
| 47 |
-
def __getitem__(self, index):
|
| 48 |
-
wav_path = self.metadata[index]
|
| 49 |
-
units_path = self.units_dir / wav_path.relative_to(self.wavs_dir)
|
| 50 |
-
|
| 51 |
-
wav, _ = torchaudio.load(wav_path)
|
| 52 |
-
wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
|
| 53 |
-
codes = np.load(units_path.with_suffix(".npy"))
|
| 54 |
-
|
| 55 |
-
return wav, torch.from_numpy(codes).long()
|
| 56 |
-
|
| 57 |
-
def collate(self, batch):
|
| 58 |
-
wavs, codes = zip(*batch)
|
| 59 |
-
wavs, codes = list(wavs), list(codes)
|
| 60 |
-
|
| 61 |
-
wav_lengths = [wav.size(-1) for wav in wavs]
|
| 62 |
-
code_lengths = [code.size(-1) for code in codes]
|
| 63 |
-
|
| 64 |
-
wav_frames = min(self.max_samples, *wav_lengths)
|
| 65 |
-
|
| 66 |
-
collated_wavs, wav_offsets = [], []
|
| 67 |
-
for wav in wavs:
|
| 68 |
-
wav_diff = wav.size(-1) - wav_frames
|
| 69 |
-
wav_offset = random.randint(0, wav_diff)
|
| 70 |
-
wav = wav[:, wav_offset : wav_offset + wav_frames]
|
| 71 |
-
|
| 72 |
-
collated_wavs.append(wav)
|
| 73 |
-
wav_offsets.append(wav_offset)
|
| 74 |
-
|
| 75 |
-
rate = self.label_rate / self.sample_rate
|
| 76 |
-
code_offsets = [round(wav_offset * rate) for wav_offset in wav_offsets]
|
| 77 |
-
code_frames = round(wav_frames * rate)
|
| 78 |
-
remaining_code_frames = [
|
| 79 |
-
length - offset for length, offset in zip(code_lengths, code_offsets)
|
| 80 |
-
]
|
| 81 |
-
code_frames = min(code_frames, *remaining_code_frames)
|
| 82 |
-
|
| 83 |
-
collated_codes = []
|
| 84 |
-
for code, code_offset in zip(codes, code_offsets):
|
| 85 |
-
code = code[code_offset : code_offset + code_frames]
|
| 86 |
-
collated_codes.append(code)
|
| 87 |
-
|
| 88 |
-
wavs = torch.stack(collated_wavs, dim=0)
|
| 89 |
-
codes = torch.stack(collated_codes, dim=0)
|
| 90 |
-
|
| 91 |
-
return wavs, codes
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
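The collate method in the deleted hubert/dataset.py crops every waveform to a shared window and must crop the unit codes to the matching span; the mapping is just the label-rate to sample-rate ratio. A small worked sketch of that alignment, written independently of the deleted code:

```python
# Offset alignment as in AcousticUnitsDataset.collate:
# 16 kHz audio with 50 Hz unit labels, so 320 samples correspond to one code frame.
sample_rate, label_rate = 16000, 50
rate = label_rate / sample_rate           # 0.003125 code frames per audio sample

wav_offset = 8000                         # crop starts 0.5 s into the waveform
code_offset = round(wav_offset * rate)    # -> 25, the matching label frame
wav_frames = 32000                        # a 2 s audio window
code_frames = round(wav_frames * rate)    # -> 100 label frames
print(code_offset, code_frames)
```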
hubert/utils.py
DELETED
|
@@ -1,58 +0,0 @@
|
|
| 1 |
-
import torch
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
class Metric:
|
| 5 |
-
def __init__(self):
|
| 6 |
-
self.steps = 0
|
| 7 |
-
self.value = 0
|
| 8 |
-
|
| 9 |
-
def update(self, value):
|
| 10 |
-
self.steps += 1
|
| 11 |
-
self.value += (value - self.value) / self.steps
|
| 12 |
-
return self.value
|
| 13 |
-
|
| 14 |
-
def reset(self):
|
| 15 |
-
self.steps = 0
|
| 16 |
-
self.value = 0
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
def save_checkpoint(
|
| 20 |
-
checkpoint_dir,
|
| 21 |
-
hubert,
|
| 22 |
-
optimizer,
|
| 23 |
-
scaler,
|
| 24 |
-
step,
|
| 25 |
-
loss,
|
| 26 |
-
best,
|
| 27 |
-
logger,
|
| 28 |
-
):
|
| 29 |
-
state = {
|
| 30 |
-
"hubert": hubert.state_dict(),
|
| 31 |
-
"optimizer": optimizer.state_dict(),
|
| 32 |
-
"scaler": scaler.state_dict(),
|
| 33 |
-
"step": step,
|
| 34 |
-
"loss": loss,
|
| 35 |
-
}
|
| 36 |
-
checkpoint_dir.mkdir(exist_ok=True, parents=True)
|
| 37 |
-
checkpoint_path = checkpoint_dir / f"model-{step}.pt"
|
| 38 |
-
torch.save(state, checkpoint_path)
|
| 39 |
-
if best:
|
| 40 |
-
best_path = checkpoint_dir / "model-best.pt"
|
| 41 |
-
torch.save(state, best_path)
|
| 42 |
-
logger.info(f"Saved checkpoint: {checkpoint_path.stem}")
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
def load_checkpoint(
|
| 46 |
-
load_path,
|
| 47 |
-
hubert,
|
| 48 |
-
optimizer,
|
| 49 |
-
scaler,
|
| 50 |
-
rank,
|
| 51 |
-
logger,
|
| 52 |
-
):
|
| 53 |
-
logger.info(f"Loading checkpoint from {load_path}")
|
| 54 |
-
checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"})
|
| 55 |
-
hubert.load_state_dict(checkpoint["hubert"])
|
| 56 |
-
scaler.load_state_dict(checkpoint["scaler"])
|
| 57 |
-
optimizer.load_state_dict(checkpoint["optimizer"])
|
| 58 |
-
return checkpoint["step"], checkpoint["loss"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
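The Metric class in the deleted hubert/utils.py keeps a running mean without storing the history, via value += (new - value) / steps. A tiny standalone check of that update rule:

```python
# Incremental mean: after n updates, value equals the plain average of the inputs.
steps, value = 0, 0.0
for v in [2.0, 4.0, 9.0]:
    steps += 1
    value += (v - value) / steps
print(value)  # 5.0, the same as (2 + 4 + 9) / 3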
hubert/model.py → hubert_model.py
RENAMED

@@ -1,20 +1,12 @@
 import copy
-from typing import Optional, Tuple
 import random
+from typing import Optional, Tuple
 
-from sklearn.cluster import KMeans
-
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
+import torch.nn.functional as t_func
 from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
 
-URLS = {
-    "hubert-discrete": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-discrete-e9416457.pt",
-    "hubert-soft": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt",
-    "kmeans100": "https://github.com/bshall/hubert/releases/download/v0.1/kmeans100-50f36a95.pt",
-}
-
 
 class Hubert(nn.Module):
     def __init__(self, num_label_embeddings: int = 100, mask: bool = True):

@@ -44,7 +36,7 @@ class Hubert(nn.Module):
         return x, mask
 
     def encode(
-            …
+            self, x: torch.Tensor, layer: Optional[int] = None
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         x = self.feature_extractor(x)
         x = self.feature_projection(x.transpose(1, 2))

@@ -75,24 +67,11 @@ class HubertSoft(Hubert):
 
     @torch.inference_mode()
     def units(self, wav: torch.Tensor) -> torch.Tensor:
-        wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+        wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
         x, _ = self.encode(wav)
         return self.proj(x)
 
 
-class HubertDiscrete(Hubert):
-    def __init__(self, kmeans):
-        super().__init__(504)
-        self.kmeans = kmeans
-
-    @torch.inference_mode()
-    def units(self, wav: torch.Tensor) -> torch.LongTensor:
-        wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
-        x, _ = self.encode(wav, layer=7)
-        x = self.kmeans.predict(x.squeeze().cpu().numpy())
-        return torch.tensor(x, dtype=torch.long, device=wav.device)
-
-
 class FeatureExtractor(nn.Module):
     def __init__(self):
         super().__init__()

@@ -106,13 +85,13 @@ class FeatureExtractor(nn.Module):
         self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = F.gelu(self.norm0(self.conv0(x)))
-        x = F.gelu(self.conv1(x))
-        x = F.gelu(self.conv2(x))
-        x = F.gelu(self.conv3(x))
-        x = F.gelu(self.conv4(x))
-        x = F.gelu(self.conv5(x))
-        x = F.gelu(self.conv6(x))
+        x = t_func.gelu(self.norm0(self.conv0(x)))
+        x = t_func.gelu(self.conv1(x))
+        x = t_func.gelu(self.conv2(x))
+        x = t_func.gelu(self.conv3(x))
+        x = t_func.gelu(self.conv4(x))
+        x = t_func.gelu(self.conv5(x))
+        x = t_func.gelu(self.conv6(x))
         return x
 

@@ -144,13 +123,13 @@ class PositionalConvEmbedding(nn.Module):
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.conv(x.transpose(1, 2))
-        x = F.gelu(x[:, :, :-1])
+        x = t_func.gelu(x[:, :, :-1])
         return x.transpose(1, 2)
 
 
 class TransformerEncoder(nn.Module):
     def __init__(
-            …
+            self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
     ) -> None:
         super(TransformerEncoder, self).__init__()
         self.layers = nn.ModuleList(

@@ -159,11 +138,11 @@ class TransformerEncoder(nn.Module):
         self.num_layers = num_layers
 
     def forward(
-            …
+            self,
+            src: torch.Tensor,
+            mask: torch.Tensor = None,
+            src_key_padding_mask: torch.Tensor = None,
+            output_layer: Optional[int] = None,
     ) -> torch.Tensor:
         output = src
         for layer in self.layers[:output_layer]:

@@ -174,11 +153,11 @@
 
 
 def _compute_mask(
-        …
+        shape: Tuple[int, int],
+        mask_prob: float,
+        mask_length: int,
+        device: torch.device,
+        min_masks: int = 0,
 ) -> torch.Tensor:
     batch_size, sequence_length = shape
 

@@ -228,62 +207,17 @@
     return mask
 
 
-def hubert_discrete(
-    pretrained: bool = True,
-    progress: bool = True,
-) -> HubertDiscrete:
-    r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
-    Args:
-        pretrained (bool): load pretrained weights into the model
-        progress (bool): show progress bar when downloading model
-    """
-    kmeans = kmeans100(pretrained=pretrained, progress=progress)
-    hubert = HubertDiscrete(kmeans)
-    if pretrained:
-        checkpoint = torch.hub.load_state_dict_from_url(
-            URLS["hubert-discrete"], progress=progress
-        )
-        consume_prefix_in_state_dict_if_present(checkpoint, "module.")
-        hubert.load_state_dict(checkpoint)
-    hubert.eval()
-    return hubert
-
-
 def hubert_soft(
-        …
+        path: str
 ) -> HubertSoft:
     r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
     Args:
         path (str): path of a pretrained model
     """
-    dev = torch.device("…
+    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     hubert = HubertSoft()
     checkpoint = torch.load(path)
     consume_prefix_in_state_dict_if_present(checkpoint, "module.")
     hubert.load_state_dict(checkpoint)
     hubert.eval().to(dev)
     return hubert
-
-
-def _kmeans(
-    num_clusters: int, pretrained: bool = True, progress: bool = True
-) -> KMeans:
-    kmeans = KMeans(num_clusters)
-    if pretrained:
-        checkpoint = torch.hub.load_state_dict_from_url(
-            URLS[f"kmeans{num_clusters}"], progress=progress
-        )
-        kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
-        kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"]
-        kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy()
-    return kmeans
-
-
-def kmeans100(pretrained: bool = True, progress: bool = True) -> KMeans:
-    r"""
-    k-means checkpoint for HuBERT-Discrete with 100 clusters.
-    Args:
-        pretrained (bool): load pretrained weights into the model
-        progress (bool): show progress bar when downloading model
-    """
-    return _kmeans(100, pretrained, progress)
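After this rename, soft-unit extraction goes through hubert_model.hubert_soft(path), which loads a HubertSoft checkpoint onto the GPU when one is available. A rough usage sketch, assuming a local checkpoint file and input wav exist (both file names below are placeholders, not part of the diff):

```python
# Sketch: extracting soft speech units with the renamed module.
import torch
import torchaudio
import hubert_model

hubert = hubert_model.hubert_soft("hubert-soft.pt")      # placeholder checkpoint path
wav, sr = torchaudio.load("input.wav")                   # placeholder input file
wav = wav.mean(dim=0, keepdim=True)                      # force mono
wav = torchaudio.functional.resample(wav, sr, 16000)     # HuBERT operates on 16 kHz audio
with torch.inference_mode():
    units = hubert.units(wav.unsqueeze(0).to(next(hubert.parameters()).device))
print(units.shape)  # (1, n_frames, unit_dim), roughly 50 unit frames per second
```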
icassp2022_vocal_transcription/.gitignore
DELETED
@@ -1,3 +0,0 @@
-output/
-audio/*
-!audio/test.wav
icassp2022_vocal_transcription/README.md
DELETED
|
@@ -1,56 +0,0 @@
|
|
| 1 |
-
# icassp2022-vocal-transcription
|
| 2 |
-
Companion code for the paper:
|
| 3 |
-
Sangeun Kum, Jongpil Lee, Keunhyoung Luke Kim, Taehyoung Kim, Juhan Nam *"Pseudo-Label Transfer from Frame-level to Note-level in a Teacher-student Framework for Singing Transcription from Polyphonic Music"*, ICASSP2022, Singapore <[link](https://ieeexplore.ieee.org/document/9747147)>
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
## Abstract
|
| 7 |
-
|
| 8 |
-
Lack of large-scale note-level labeled data is the major obstacle to singing transcription from polyphonic music. We address the issue by using pseudo labels from vocal pitch estimation models given unlabeled data. The proposed method first converts the frame-level pseudo labels to note-level through pitch and rhythm quantization steps. Then, it further improves the label quality through self- training in a teacher-student framework.
|
| 9 |
-
|
| 10 |
-
<img src="./img/ICASSP2022-fig1-2.png" width="70%">
|
| 11 |
-
|
| 12 |
-
To validate the method, we conduct various experiment settings by investigating two vocal pitch estimation models as pseudo-label generators, two setups of teacher-student frameworks, and the number of iterations in self-training. The results show that the proposed method can effectively leverage large-scale unlabeled audio data and self-training with the noisy student model helps to improve performance. Finally, we show that the model trained with only unlabeled data has comparable performance to previous works and the model trained with addi- tional labeled data achieves higher accuracy than the model trained with only labeled data.
|
| 13 |
-
|
| 14 |
-
## Demo video
|
| 15 |
-
- <[Youtube Link 1](https://www.youtube.com/watch?v=wlD-GAGuj0M "Demo 1: Singing transcription from polpyphonic music")> You&I (IU)
|
| 16 |
-
- <[Youtube Link 2](https://youtu.be/iitOC4vuC8U "Demo 2: Singing transcription from polpyphonic music")> You in my arms (Myung jin Moon)
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
## Dependencies
|
| 20 |
-
|
| 21 |
-
- OS: LINUX
|
| 22 |
-
- Programming language: Python 3.6+
|
| 23 |
-
- Python Library
|
| 24 |
-
- Keras 2.7.0 (Deep Learning library)
|
| 25 |
-
- tensorflow 2.5.0 (Deep Learning library)
|
| 26 |
-
- Librosa 0.8.1 (for STFT)
|
| 27 |
-
- pydub 0.25.1 (for loading audio and resampling)
|
| 28 |
-
- pretty-midi (for handling midi data)
|
| 29 |
-
- Numpy, SciPy
|
| 30 |
-
|
| 31 |
-
- Hardware
|
| 32 |
-
- 1 GPU : GeForce GTX 3090
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
## Using STP from the command line
|
| 36 |
-
```
|
| 37 |
-
$ python singing_transcription.py -i ../audio/test.wav -o ../output
|
| 38 |
-
|
| 39 |
-
[optional arguments]
|
| 40 |
-
-i path_audio Path to input audio file. (default: '../audio/pop1.wav')
|
| 41 |
-
-o pathsave Path to folder for saving .mid file (default: '../output')
|
| 42 |
-
-ot output_type (optional) Output type: midi or frame-level pitch score(fps) (default: 'midi')
|
| 43 |
-
```
|
| 44 |
-
- output example: ADC04-pop1.wav
|
| 45 |
-
<img src="./img/example_pop1_midi.png" width="100%">
|
| 46 |
-
# Citation
|
| 47 |
-
If you find our work useful, please consider citing our paper.
|
| 48 |
-
|
| 49 |
-
```
|
| 50 |
-
@inproceedings{kum2022pseudo,
|
| 51 |
-
title={Pseudo-Label Transfer from Frame-Level to Note-Level in a Teacher-Student Framework for Singing Transcription from Polyphonic Music},
|
| 52 |
-
author={Sangeun Kum, Jongpil Lee, Keunhyoung Luke Kim, Taehyoung Kim, and Juhan Nam},
|
| 53 |
-
booktitle={Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
|
| 54 |
-
year={2022}
|
| 55 |
-
}
|
| 56 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
icassp2022_vocal_transcription/__init__.py
DELETED
@@ -1,3 +0,0 @@
-from .src import singing_transcription
-
-transcribe = singing_transcription.get_frame_level_output
icassp2022_vocal_transcription/__pycache__/__init__.cpython-38.pyc
DELETED
Binary file (254 Bytes)

icassp2022_vocal_transcription/data/weight_ST.hdf5
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3ba38c046af48a359575c1a312d931966e56d94013ad56dd91f2de5219afa8a4
-size 17535208

icassp2022_vocal_transcription/data/x_train_mean.npy
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f977a72104d19c3b92c764a4fe1335f411ffc331bb6f81ec2420016f07fa772c
-size 4232

icassp2022_vocal_transcription/data/x_train_std.npy
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3a120cbf8bc8e62544f7b0ce1185b0244f3c6971fd50b3092c66a0fda1f5405a
-size 4232

icassp2022_vocal_transcription/img/ICASSP2022-fig1-2.png
DELETED
Binary file (26.9 kB)

icassp2022_vocal_transcription/img/example_pop1_midi.png
DELETED
Binary file (136 kB)
icassp2022_vocal_transcription/requirements.txt
DELETED
@@ -1,8 +0,0 @@
-keras==2.7.0
-numpy==1.19.5
-librosa==0.8.1
-mir-eval==0.6
-pretty-midi==0.2.9
-pydub==0.25.1
-scipy==1.7.3
-tensorflow==2.5.0
icassp2022_vocal_transcription/src/MIDI.py
DELETED
|
@@ -1,141 +0,0 @@
|
|
| 1 |
-
#%%
|
| 2 |
-
import pretty_midi
|
| 3 |
-
import numpy as np
|
| 4 |
-
import librosa.display
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
#%%
|
| 8 |
-
def plot_piano_roll(pm, start_pitch, end_pitch, fs=100):
|
| 9 |
-
""" Plot piano roll from .mid file
|
| 10 |
-
----------
|
| 11 |
-
Parameters:
|
| 12 |
-
pm: RWC, MDB, iKala, DSD100
|
| 13 |
-
start/end_pitch: lowest/highest note (float)
|
| 14 |
-
fs: sampling freq. (int)
|
| 15 |
-
|
| 16 |
-
"""
|
| 17 |
-
# Use librosa's specshow function for displaying the piano roll
|
| 18 |
-
librosa.display.specshow(
|
| 19 |
-
pm.get_piano_roll(fs)[start_pitch:end_pitch],
|
| 20 |
-
hop_length=1,
|
| 21 |
-
sr=fs,
|
| 22 |
-
x_axis="time",
|
| 23 |
-
y_axis="cqt_note",
|
| 24 |
-
fmin=pretty_midi.note_number_to_hz(start_pitch),
|
| 25 |
-
)
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
def midi_to_note(file_name, pitch_shift, fs=100, start_note=40, end_note=95):
|
| 29 |
-
""" Convert .mid to note
|
| 30 |
-
----------
|
| 31 |
-
Parameters:
|
| 32 |
-
file_name: '.mid' (str)
|
| 33 |
-
pitch_sifht: shift the pitch to adjust notes correctly (int)
|
| 34 |
-
fs: sampling freq. (int)
|
| 35 |
-
start/end_pitch: lowest/highest note(int)
|
| 36 |
-
|
| 37 |
-
----------
|
| 38 |
-
Returns:
|
| 39 |
-
notes: note/10ms (array)
|
| 40 |
-
"""
|
| 41 |
-
|
| 42 |
-
pm = pretty_midi.PrettyMIDI(file_name)
|
| 43 |
-
frame_note = pm.get_piano_roll(fs)[start_note:end_note]
|
| 44 |
-
|
| 45 |
-
length_audio = frame_note.shape[1]
|
| 46 |
-
notes = np.zeros(length_audio)
|
| 47 |
-
|
| 48 |
-
for i in range(length_audio):
|
| 49 |
-
note_tmp = np.argmax(frame_note[:, i])
|
| 50 |
-
if note_tmp > 0:
|
| 51 |
-
notes[i] = (note_tmp + start_note) + pitch_shift
|
| 52 |
-
# note[i] = 2 ** ((note_tmp -69) / 12.) * 440
|
| 53 |
-
return notes
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
def midi_to_segment(filename):
|
| 57 |
-
""" Convert .mid to segment
|
| 58 |
-
----------
|
| 59 |
-
Parameters:
|
| 60 |
-
filename: .mid (str)
|
| 61 |
-
|
| 62 |
-
----------
|
| 63 |
-
Returns:
|
| 64 |
-
segments: [start(s),end(s),pitch] (list)
|
| 65 |
-
"""
|
| 66 |
-
|
| 67 |
-
pm = pretty_midi.PrettyMIDI(filename)
|
| 68 |
-
segment = []
|
| 69 |
-
for note in pm.instruments[0].notes:
|
| 70 |
-
segment.append([note.start, note.end, note.pitch])
|
| 71 |
-
return segment
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
def segment_to_midi(segments, path_output, tempo=120):
|
| 75 |
-
""" Convert segment to .mid
|
| 76 |
-
----------
|
| 77 |
-
Parameters:
|
| 78 |
-
segments: [start(s),end(s),pitch] (list)
|
| 79 |
-
path_output: path of save file (str)
|
| 80 |
-
"""
|
| 81 |
-
pm = pretty_midi.PrettyMIDI(initial_tempo=int(tempo))
|
| 82 |
-
inst_program = pretty_midi.instrument_name_to_program("Acoustic Grand Piano")
|
| 83 |
-
inst = pretty_midi.Instrument(program=inst_program)
|
| 84 |
-
for segment in segments:
|
| 85 |
-
note = pretty_midi.Note(
|
| 86 |
-
velocity=100, start=segment[0], end=segment[1], pitch=np.int(segment[2])
|
| 87 |
-
)
|
| 88 |
-
inst.notes.append(note)
|
| 89 |
-
pm.instruments.append(inst)
|
| 90 |
-
pm.write(f"{path_output}")
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
def note_to_segment(note):
|
| 94 |
-
""" Convert note to segment
|
| 95 |
-
----------
|
| 96 |
-
Parameters:
|
| 97 |
-
note: note/10ms (array)
|
| 98 |
-
----------
|
| 99 |
-
Returns:
|
| 100 |
-
segments: [start(s),end(s),pitch] (list)
|
| 101 |
-
"""
|
| 102 |
-
startSeg = []
|
| 103 |
-
endSeg = []
|
| 104 |
-
notes = []
|
| 105 |
-
flag = -1
|
| 106 |
-
|
| 107 |
-
if note[0] > 0:
|
| 108 |
-
startSeg.append(0)
|
| 109 |
-
notes.append(np.int(note[0]))
|
| 110 |
-
flag *= -1
|
| 111 |
-
for i in range(0, len(note) - 1):
|
| 112 |
-
if note[i] != note[i + 1]:
|
| 113 |
-
if flag < 0:
|
| 114 |
-
startSeg.append(0.01 * (i + 1))
|
| 115 |
-
notes.append(np.int(note[i + 1]))
|
| 116 |
-
flag *= -1
|
| 117 |
-
else:
|
| 118 |
-
if note[i + 1] == 0:
|
| 119 |
-
endSeg.append(0.01 * i)
|
| 120 |
-
flag *= -1
|
| 121 |
-
else:
|
| 122 |
-
endSeg.append(0.01 * i)
|
| 123 |
-
startSeg.append(0.01 * (i + 1))
|
| 124 |
-
notes.append(np.int(note[i + 1]))
|
| 125 |
-
|
| 126 |
-
return list(zip(startSeg, endSeg, notes))
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
def note2Midi(frame_level_pitchscroe, path_output, tempo):
|
| 130 |
-
# note = np.loadtxt(path_input_note)
|
| 131 |
-
# note = note[:, 1]
|
| 132 |
-
segment = note_to_segment(frame_level_pitchscroe)
|
| 133 |
-
segment_to_midi(segment, path_output=path_output, tempo=tempo)
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
# def note2Midi(path_input_note, path_output, tempo):
|
| 137 |
-
# note = np.loadtxt(path_input_note)
|
| 138 |
-
# note = note[:, 1]
|
| 139 |
-
# segment = note_to_segment(note)
|
| 140 |
-
# segment_to_midi(segment, path_output=path_output, tempo=tempo)
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
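The deleted MIDI.py works on frame-level note arrays sampled every 10 ms and groups equal consecutive values into [start, end, pitch] segments before writing a .mid file. A standalone illustration of that grouping, simplified and not the deleted code itself:

```python
# Group a 10 ms frame-level note track into (start_s, end_s, midi_pitch) segments.
def frames_to_segments(notes, hop_s=0.01):
    segments, start, prev = [], None, 0
    for i, n in enumerate(list(notes) + [0]):   # a trailing 0 flushes the last note
        if n != prev:
            if prev > 0:
                segments.append((start * hop_s, i * hop_s, prev))
            start = i
            prev = n
    return segments

print(frames_to_segments([0, 60, 60, 60, 62, 62, 0, 0]))
# [(0.01, 0.04, 60), (0.04, 0.06, 62)]
```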
icassp2022_vocal_transcription/src/__init__.py
DELETED
File without changes

icassp2022_vocal_transcription/src/__pycache__/MIDI.cpython-38.pyc
DELETED
Binary file (3.48 kB)

icassp2022_vocal_transcription/src/__pycache__/__init__.cpython-38.pyc
DELETED
Binary file (165 Bytes)

icassp2022_vocal_transcription/src/__pycache__/featureExtraction.cpython-38.pyc
DELETED
Binary file (1.74 kB)

icassp2022_vocal_transcription/src/__pycache__/model.cpython-38.pyc
DELETED
Binary file (3.1 kB)

icassp2022_vocal_transcription/src/__pycache__/quantization.cpython-38.pyc
DELETED
Binary file (4.92 kB)

icassp2022_vocal_transcription/src/__pycache__/singing_transcription.cpython-38.pyc
DELETED
Binary file (3.99 kB)

icassp2022_vocal_transcription/src/__pycache__/utils.cpython-38.pyc
DELETED
Binary file (1.5 kB)
icassp2022_vocal_transcription/src/featureExtraction.py
DELETED
|
@@ -1,61 +0,0 @@
|
|
| 1 |
-
# -*- coding: utf-8 -*-
|
| 2 |
-
import librosa
|
| 3 |
-
from pydub import AudioSegment
|
| 4 |
-
import pathlib
|
| 5 |
-
|
| 6 |
-
# from pydub.playback import play
|
| 7 |
-
import numpy as np
|
| 8 |
-
import os
|
| 9 |
-
|
| 10 |
-
PATH_PROJECT = os.path.dirname(os.path.realpath(__file__))
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def read_audio(filepath, sr=None):
|
| 14 |
-
path = pathlib.Path(filepath)
|
| 15 |
-
extenstion = path.suffix.replace(".", "")
|
| 16 |
-
if extenstion == "mp3":
|
| 17 |
-
sound = AudioSegment.from_mp3(filepath)
|
| 18 |
-
else:
|
| 19 |
-
sound = AudioSegment.from_file(filepath)
|
| 20 |
-
# sound = sound[start * 1000 : end * 1000]
|
| 21 |
-
sound = sound.set_channels(1)
|
| 22 |
-
if sr == None:
|
| 23 |
-
sr = sound.frame_rate
|
| 24 |
-
sound = sound.set_frame_rate(sr)
|
| 25 |
-
samples = sound.get_array_of_samples()
|
| 26 |
-
y = np.array(samples).T.astype(np.float32)
|
| 27 |
-
|
| 28 |
-
return y, sr
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
def spec_extraction(file_name, win_size):
|
| 32 |
-
|
| 33 |
-
y, _ = read_audio(file_name, sr=8000)
|
| 34 |
-
|
| 35 |
-
S = librosa.core.stft(y, n_fft=1024, hop_length=80, win_length=1024)
|
| 36 |
-
x_spec = np.abs(S)
|
| 37 |
-
x_spec = librosa.core.power_to_db(x_spec, ref=np.max)
|
| 38 |
-
x_spec = x_spec.astype(np.float32)
|
| 39 |
-
num_frames = x_spec.shape[1]
|
| 40 |
-
|
| 41 |
-
# for padding
|
| 42 |
-
padNum = num_frames % win_size
|
| 43 |
-
if padNum != 0:
|
| 44 |
-
len_pad = win_size - padNum
|
| 45 |
-
padding_feature = np.zeros(shape=(513, len_pad))
|
| 46 |
-
x_spec = np.concatenate((x_spec, padding_feature), axis=1)
|
| 47 |
-
num_frames = num_frames + len_pad
|
| 48 |
-
|
| 49 |
-
x_test = []
|
| 50 |
-
for j in range(0, num_frames, win_size):
|
| 51 |
-
x_test_tmp = x_spec[:, range(j, j + win_size)].T
|
| 52 |
-
x_test.append(x_test_tmp)
|
| 53 |
-
x_test = np.array(x_test)
|
| 54 |
-
|
| 55 |
-
# for standardization
|
| 56 |
-
path_project = pathlib.Path(__file__).parent.parent
|
| 57 |
-
x_train_mean = np.load(f"{path_project}/data/x_train_mean.npy")
|
| 58 |
-
x_train_std = np.load(f"{path_project}/data/x_train_std.npy")
|
| 59 |
-
x_test = (x_test - x_train_mean) / (x_train_std + 0.0001)
|
| 60 |
-
x_test = x_test[:, :, :, np.newaxis]
|
| 61 |
-
return x_test, x_spec
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
icassp2022_vocal_transcription/src/model.py
DELETED
|
@@ -1,139 +0,0 @@
|
|
| 1 |
-
# import keras.backend as KK
|
| 2 |
-
import math
|
| 3 |
-
from tensorflow.keras import backend as K
|
| 4 |
-
from tensorflow.keras.regularizers import l2
|
| 5 |
-
from tensorflow.keras.models import Model
|
| 6 |
-
from tensorflow.keras.layers import (
|
| 7 |
-
Conv2D,
|
| 8 |
-
MaxPooling2D,
|
| 9 |
-
BatchNormalization,
|
| 10 |
-
LeakyReLU,
|
| 11 |
-
Dropout,
|
| 12 |
-
LSTM,
|
| 13 |
-
Reshape,
|
| 14 |
-
Bidirectional,
|
| 15 |
-
TimeDistributed,
|
| 16 |
-
Input,
|
| 17 |
-
add,
|
| 18 |
-
concatenate,
|
| 19 |
-
Lambda,
|
| 20 |
-
Dense,
|
| 21 |
-
Activation,
|
| 22 |
-
)
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
# --------------------------------------------------------------------------------
|
| 26 |
-
def ResNet_Block(input, block_id, filterNum):
|
| 27 |
-
x = Conv2D(
|
| 28 |
-
filterNum,
|
| 29 |
-
(1, 1),
|
| 30 |
-
name="conv_s" + str(block_id) + "_1x1",
|
| 31 |
-
padding="same",
|
| 32 |
-
kernel_initializer="he_normal",
|
| 33 |
-
use_bias=False,
|
| 34 |
-
)(input)
|
| 35 |
-
shortcut = BatchNormalization()(x)
|
| 36 |
-
x = LeakyReLU(0.01)(shortcut)
|
| 37 |
-
|
| 38 |
-
x = Conv2D(
|
| 39 |
-
filterNum,
|
| 40 |
-
(3, 3),
|
| 41 |
-
name="conv" + str(block_id) + "_1",
|
| 42 |
-
padding="same",
|
| 43 |
-
kernel_initializer="he_normal",
|
| 44 |
-
use_bias=False,
|
| 45 |
-
kernel_regularizer=l2(1e-5),
|
| 46 |
-
)(x)
|
| 47 |
-
x = BatchNormalization()(x)
|
| 48 |
-
x = LeakyReLU(0.01)(x)
|
| 49 |
-
|
| 50 |
-
# x = Dropout(0.3)(x)
|
| 51 |
-
|
| 52 |
-
x = Conv2D(
|
| 53 |
-
filterNum,
|
| 54 |
-
(3, 3),
|
| 55 |
-
name="conv" + str(block_id) + "_2",
|
| 56 |
-
padding="same",
|
| 57 |
-
kernel_initializer="he_normal",
|
| 58 |
-
use_bias=False,
|
| 59 |
-
kernel_regularizer=l2(1e-5),
|
| 60 |
-
)(x)
|
| 61 |
-
x = BatchNormalization()(x)
|
| 62 |
-
x = LeakyReLU(0.01)(x)
|
| 63 |
-
|
| 64 |
-
x = Conv2D(
|
| 65 |
-
filterNum,
|
| 66 |
-
(1, 1),
|
| 67 |
-
name="conv_f" + str(block_id) + "_1x1",
|
| 68 |
-
padding="same",
|
| 69 |
-
kernel_initializer="he_normal",
|
| 70 |
-
use_bias=False,
|
| 71 |
-
)(x)
|
| 72 |
-
x = BatchNormalization()(x)
|
| 73 |
-
|
| 74 |
-
x = add([x, shortcut])
|
| 75 |
-
x = LeakyReLU(0.01)(x)
|
| 76 |
-
x = MaxPooling2D((1, 4))(x)
|
| 77 |
-
return x
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
def melody_ResNet_JDC(num_spec, window_size, note_res):
|
| 81 |
-
|
| 82 |
-
num_output = int(55 * 2 ** (math.log(note_res, 2)) + 2)
|
| 83 |
-
input = Input(shape=(window_size, num_spec, 1))
|
| 84 |
-
block_1 = ResNet_Block(input=input, block_id=1, filterNum=64)
|
| 85 |
-
block_2 = ResNet_Block(input=block_1, block_id=2, filterNum=128)
|
| 86 |
-
block_3 = ResNet_Block(input=block_2, block_id=3, filterNum=192)
|
| 87 |
-
block_4 = ResNet_Block(input=block_3, block_id=4, filterNum=256)
|
| 88 |
-
block_4_dp = Dropout(0.3)(block_4)
|
| 89 |
-
|
| 90 |
-
keras_shape = K.int_shape(block_4)
|
| 91 |
-
numOutput_P = keras_shape[2] * keras_shape[3]
|
| 92 |
-
output_tmp = Reshape((window_size, numOutput_P))(block_4_dp)
|
| 93 |
-
|
| 94 |
-
# voicing
|
| 95 |
-
block_1 = MaxPooling2D((1, 4 ** 3))(block_1)
|
| 96 |
-
block_2 = MaxPooling2D((1, 4 ** 2))(block_2)
|
| 97 |
-
block_3 = MaxPooling2D((1, 4 ** 1))(block_3)
|
| 98 |
-
joint = concatenate([block_1, block_2, block_3, block_4])
|
| 99 |
-
joint = Dropout(0.3)(joint)
|
| 100 |
-
joint = Conv2D(
|
| 101 |
-
256,
|
| 102 |
-
(1, 1),
|
| 103 |
-
padding="same",
|
| 104 |
-
kernel_initializer="he_normal",
|
| 105 |
-
use_bias=False,
|
| 106 |
-
kernel_regularizer=l2(1e-5),
|
| 107 |
-
)(joint)
|
| 108 |
-
joint = BatchNormalization()(joint)
|
| 109 |
-
joint = LeakyReLU(0.01)(joint)
|
| 110 |
-
|
| 111 |
-
keras_shape2 = K.int_shape(joint)
|
| 112 |
-
num_V = keras_shape2[2] * keras_shape2[3]
|
| 113 |
-
|
| 114 |
-
output_V_tmp = Reshape((window_size, num_V))(joint)
|
| 115 |
-
output_V_tmp = Bidirectional(LSTM(32, return_sequences=True, stateful=False, dropout=0.2))(
|
| 116 |
-
output_V_tmp
|
| 117 |
-
)
|
| 118 |
-
output_V = TimeDistributed(Dense(2))(output_V_tmp)
|
| 119 |
-
output_V = TimeDistributed(Activation("softmax"), name="output_AUX_V")(output_V)
|
| 120 |
-
|
| 121 |
-
# output
|
| 122 |
-
output_tmp = Bidirectional(LSTM(256, return_sequences=True, dropout=0.2))(output_tmp)
|
| 123 |
-
output_tmp = concatenate([output_tmp, output_V_tmp])
|
| 124 |
-
output = TimeDistributed(Dense(num_output))(output_tmp)
|
| 125 |
-
output = TimeDistributed(Activation("softmax"), name="output")(output)
|
| 126 |
-
|
| 127 |
-
output_NS = Lambda(lambda x: x[:, :, 0])(output)
|
| 128 |
-
output_NS = Reshape((window_size, 1))(output_NS)
|
| 129 |
-
|
| 130 |
-
output_S = Lambda(lambda x: 1 - x[:, :, 0])(output)
|
| 131 |
-
output_S = Reshape((window_size, 1))(output_S)
|
| 132 |
-
output_PV = concatenate([output_NS, output_S])
|
| 133 |
-
|
| 134 |
-
output_V_F = concatenate([output_V, output_PV])
|
| 135 |
-
output_V_F = TimeDistributed(Dense(2))(output_V_F)
|
| 136 |
-
output_V_F = TimeDistributed(Activation("softmax"), name="output_V")(output_V_F)
|
| 137 |
-
model = Model(inputs=input, outputs=[output, output_V_F])
|
| 138 |
-
|
| 139 |
-
return model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
icassp2022_vocal_transcription/src/quantization.py
DELETED
|
@@ -1,217 +0,0 @@
|
|
| 1 |
-
# %%
|
| 2 |
-
import numpy as np
|
| 3 |
-
import librosa
|
| 4 |
-
import librosa.display
|
| 5 |
-
|
| 6 |
-
from scipy.signal import medfilt
|
| 7 |
-
from matplotlib import pyplot as plt
|
| 8 |
-
from .featureExtraction import read_audio
|
| 9 |
-
from .utils import *
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
# %%
|
| 13 |
-
def calc_tempo(path_audio):
|
| 14 |
-
""" Calculate audio tempo
|
| 15 |
-
----------
|
| 16 |
-
Parameters:
|
| 17 |
-
path_audio: str
|
| 18 |
-
|
| 19 |
-
----------
|
| 20 |
-
Returns:
|
| 21 |
-
tempo: float
|
| 22 |
-
|
| 23 |
-
"""
|
| 24 |
-
target_sr = 22050
|
| 25 |
-
y, _ = read_audio(path_audio, sr=target_sr)
|
| 26 |
-
onset_strength = librosa.onset.onset_strength(y, sr=target_sr)
|
| 27 |
-
tempo = librosa.beat.tempo(onset_envelope=onset_strength, sr=target_sr)
|
| 28 |
-
return tempo
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
def one_beat_frame_size(tempo):
|
| 32 |
-
""" Calculate frame size of 1 beat
|
| 33 |
-
----------
|
| 34 |
-
Parameters:
|
| 35 |
-
tempo: float
|
| 36 |
-
|
| 37 |
-
----------
|
| 38 |
-
Returns:
|
| 39 |
-
tempo: int
|
| 40 |
-
|
| 41 |
-
"""
|
| 42 |
-
return np.int(np.round(60 / tempo * 100))
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
def median_filter_pitch(pitch, medfilt_size, weight):
|
| 46 |
-
""" Smoothing pitch using median filter
|
| 47 |
-
----------
|
| 48 |
-
Parameters:
|
| 49 |
-
pitch: array
|
| 50 |
-
medfilt_size: int
|
| 51 |
-
weight: float
|
| 52 |
-
|
| 53 |
-
----------
|
| 54 |
-
Returns:
|
| 55 |
-
pitch: array
|
| 56 |
-
|
| 57 |
-
"""
|
| 58 |
-
|
| 59 |
-
medfilt_size = np.int(medfilt_size * weight)
|
| 60 |
-
if medfilt_size % 2 == 0:
|
| 61 |
-
medfilt_size += 1
|
| 62 |
-
return np.round(medfilt(pitch, medfilt_size))
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
def clean_note_frames(note, min_note_len=5):
|
| 66 |
-
""" Remove short pitch frames
|
| 67 |
-
----------
|
| 68 |
-
Parameters:
|
| 69 |
-
note: array
|
| 70 |
-
min_note_len: int
|
| 71 |
-
|
| 72 |
-
----------
|
| 73 |
-
Returns:
|
| 74 |
-
output: array
|
| 75 |
-
|
| 76 |
-
"""
|
| 77 |
-
|
| 78 |
-
prev_pitch = 0
|
| 79 |
-
prev_pitch_start = 0
|
| 80 |
-
output = np.copy(note)
|
| 81 |
-
for i in range(len(note)):
|
| 82 |
-
pitch = note[i]
|
| 83 |
-
if pitch != prev_pitch:
|
| 84 |
-
prev_pitch_duration = i - prev_pitch_start
|
| 85 |
-
if prev_pitch_duration < min_note_len:
|
| 86 |
-
output[prev_pitch_start:i] = [0] * prev_pitch_duration
|
| 87 |
-
prev_pitch = pitch
|
| 88 |
-
prev_pitch_start = i
|
| 89 |
-
return output
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
def makeSegments(note):
|
| 93 |
-
""" Make segments of notes
|
| 94 |
-
----------
|
| 95 |
-
Parameters:
|
| 96 |
-
note: array
|
| 97 |
-
|
| 98 |
-
----------
|
| 99 |
-
Returns:
|
| 100 |
-
startSeg: starting points (array)
|
| 101 |
-
endSeg: ending points (array)
|
| 102 |
-
|
| 103 |
-
"""
|
| 104 |
-
startSeg = []
|
| 105 |
-
endSeg = []
|
| 106 |
-
flag = -1
|
| 107 |
-
if note[0] > 0:
|
| 108 |
-
startSeg.append(0)
|
| 109 |
-
flag *= -1
|
| 110 |
-
for i in range(0, len(note) - 1):
|
| 111 |
-
if note[i] != note[i + 1]:
|
| 112 |
-
if flag < 0:
|
| 113 |
-
startSeg.append(i + 1)
|
| 114 |
-
flag *= -1
|
| 115 |
-
else:
|
| 116 |
-
if note[i + 1] == 0:
|
| 117 |
-
endSeg.append(i)
|
| 118 |
-
flag *= -1
|
| 119 |
-
else:
|
| 120 |
-
endSeg.append(i)
|
| 121 |
-
startSeg.append(i + 1)
|
| 122 |
-
return startSeg, endSeg
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
def remove_short_segment(idx, note_cleaned, start, end, minLength):
|
| 126 |
-
""" Remove short segments
|
| 127 |
-
----------
|
| 128 |
-
Parameters:
|
| 129 |
-
idx: (int)
|
| 130 |
-
note_cleaned: (array)
|
| 131 |
-
start: starting points (array)
|
| 132 |
-
end: ending points (array)
|
| 133 |
-
minLength: (int)
|
| 134 |
-
|
| 135 |
-
----------
|
| 136 |
-
Returns:
|
| 137 |
-
note_cleaned: (array)
|
| 138 |
-
|
| 139 |
-
"""
|
| 140 |
-
|
| 141 |
-
len_seg = end[idx] - start[idx]
|
| 142 |
-
if len_seg < minLength:
|
| 143 |
-
if (start[idx + 1] - end[idx] > minLength) and (start[idx] - end[idx - 1] > minLength):
|
| 144 |
-
note_cleaned[start[idx] : end[idx] + 1] = [0] * (len_seg + 1)
|
| 145 |
-
return note_cleaned
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
def remove_octave_error(idx, note_cleaned, start, end):
|
| 149 |
-
""" Remove octave error
|
| 150 |
-
----------
|
| 151 |
-
Parameters:
|
| 152 |
-
idx: (int)
|
| 153 |
-
note_cleaned: (array)
|
| 154 |
-
start: starting points (array)
|
| 155 |
-
end: ending points (array)
|
| 156 |
-
|
| 157 |
-
----------
|
| 158 |
-
Returns:
|
| 159 |
-
note_cleaned: (array)
|
| 160 |
-
|
| 161 |
-
"""
|
| 162 |
-
len_seg = end[idx] - start[idx]
|
| 163 |
-
if (note_cleaned[start[idx - 1]] == note_cleaned[start[idx + 1]]) and (
|
| 164 |
-
note_cleaned[start[idx]] != note_cleaned[start[idx + 1]]
|
| 165 |
-
):
|
| 166 |
-
if np.abs(note_cleaned[start[idx]] - note_cleaned[start[idx + 1]]) % 12 == 0:
|
| 167 |
-
note_cleaned[start[idx] - 1 : end[idx] + 1] = [note_cleaned[start[idx + 1]]] * (
|
| 168 |
-
len_seg + 2
|
| 169 |
-
)
|
| 170 |
-
return note_cleaned
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
def clean_segment(note, minLength):
|
| 174 |
-
""" clean note segments
|
| 175 |
-
----------
|
| 176 |
-
Parameters:
|
| 177 |
-
note: (array)
|
| 178 |
-
minLength: (int)
|
| 179 |
-
|
| 180 |
-
----------
|
| 181 |
-
Returns:
|
| 182 |
-
note_cleaned: (array)
|
| 183 |
-
|
| 184 |
-
"""
|
| 185 |
-
|
| 186 |
-
note_cleaned = np.copy(note)
|
| 187 |
-
start, end = makeSegments(note_cleaned)
|
| 188 |
-
|
| 189 |
-
for i in range(1, len(start) - 1):
|
| 190 |
-
note_cleaned = remove_short_segment(i, note_cleaned, start, end, minLength)
|
| 191 |
-
note_cleaned = remove_octave_error(i, note_cleaned, start, end)
|
| 192 |
-
return note_cleaned
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
def refine_note(est_note, tempo):
|
| 196 |
-
""" main: refine note segments
|
| 197 |
-
----------
|
| 198 |
-
Parameters:
|
| 199 |
-
est_note: (array)
|
| 200 |
-
tempo: (float)
|
| 201 |
-
|
| 202 |
-
----------
|
| 203 |
-
Returns:
|
| 204 |
-
est_pitch_mf3_v: (array)
|
| 205 |
-
|
| 206 |
-
"""
|
| 207 |
-
one_beat_size = one_beat_frame_size(tempo)
|
| 208 |
-
est_note_mf1 = median_filter_pitch(est_note, one_beat_size, 1 / 8)
|
| 209 |
-
est_note_mf2 = median_filter_pitch(est_note_mf1, one_beat_size, 1 / 4)
|
| 210 |
-
est_note_mf3 = median_filter_pitch(est_note_mf2, one_beat_size, 1 / 3)
|
| 211 |
-
|
| 212 |
-
vocing = est_note_mf1 > 0
|
| 213 |
-
est_pitch_mf3_v = vocing * est_note_mf3
|
| 214 |
-
est_pitch_mf3_v = clean_note_frames(est_pitch_mf3_v, int(one_beat_size * 1 / 8))
|
| 215 |
-
est_pitch_mf3_v = clean_segment(est_pitch_mf3_v, int(one_beat_size * 1 / 4))
|
| 216 |
-
return est_pitch_mf3_v
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
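The deleted quantization.py derives its smoothing-window sizes from the estimated tempo: one beat covers round(60 / tempo * 100) of the 10 ms frames, and the median filters and minimum segment lengths use fractions of that. A quick numeric check of those sizes, written independently of the deleted code:

```python
import numpy as np

def one_beat_frame_size(tempo):
    # 60 / tempo seconds per beat, 100 frames per second (10 ms hop)
    return int(np.round(60 / tempo * 100))

beat = one_beat_frame_size(120)                      # 50 frames per beat at 120 BPM
print(beat, int(beat * 1 / 8), int(beat * 1 / 4))    # 50, 6, 12
```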
icassp2022_vocal_transcription/src/singing_transcription.py
DELETED
|
@@ -1,147 +0,0 @@
|
|
| 1 |
-
# -*- coding: utf-8 -*-
|
| 2 |
-
# %%
|
| 3 |
-
import argparse
|
| 4 |
-
import numpy as np
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
from .model import *
|
| 7 |
-
from .featureExtraction import *
|
| 8 |
-
from .quantization import *
|
| 9 |
-
from .utils import *
|
| 10 |
-
from .MIDI import *
|
| 11 |
-
|
| 12 |
-
# %%
|
| 13 |
-
class SingingTranscription:
|
| 14 |
-
def __init__(self):
|
| 15 |
-
|
| 16 |
-
self.PATH_PROJECT = pathlib.Path(__file__).absolute().parent.parent
|
| 17 |
-
self.num_spec = 513
|
| 18 |
-
self.window_size = 31
|
| 19 |
-
self.note_res = 1
|
| 20 |
-
self.batch_size = 64
|
| 21 |
-
|
| 22 |
-
def load_model(self, path_weight, TF_summary=False):
|
| 23 |
-
|
| 24 |
-
model = melody_ResNet_JDC(self.num_spec, self.window_size, self.note_res)
|
| 25 |
-
model.load_weights(path_weight)
|
| 26 |
-
if TF_summary == True:
|
| 27 |
-
print(model.summary())
|
| 28 |
-
return model
|
| 29 |
-
|
| 30 |
-
def predict_melody(self, model_ST, filepath):
|
| 31 |
-
pitch_range = np.arange(40, 95 + 1.0 / self.note_res, 1.0 / self.note_res)
|
| 32 |
-
pitch_range = np.concatenate([np.zeros(1), pitch_range])
|
| 33 |
-
|
| 34 |
-
""" Features extraction"""
|
| 35 |
-
X_test, _ = spec_extraction(file_name=filepath, win_size=self.window_size)
|
| 36 |
-
|
| 37 |
-
""" melody predict"""
|
| 38 |
-
y_predict = model_ST.predict(X_test, batch_size=self.batch_size, verbose=1)
|
| 39 |
-
y_predict = y_predict[0] # [0]:note, [1]:vocing
|
| 40 |
-
y_shape = y_predict.shape
|
| 41 |
-
num_total = y_shape[0] * y_shape[1]
|
| 42 |
-
y_predict = np.reshape(y_predict, (num_total, y_shape[2]))
|
| 43 |
-
|
| 44 |
-
est_MIDI = np.zeros(num_total)
|
| 45 |
-
est_freq = np.zeros(num_total)
|
| 46 |
-
for i in range(num_total):
|
| 47 |
-
index_predict = np.argmax(y_predict[i])
|
| 48 |
-
pitch_MIDI = pitch_range[np.int32(index_predict)]
|
| 49 |
-
if pitch_MIDI >= 40 and pitch_MIDI <= 95:
|
| 50 |
-
est_MIDI[i] = pitch_MIDI
|
| 51 |
-
# est_freq[i] = 2 ** ((pitch_MIDI - 69) / 12.0) * 440
|
| 52 |
-
return est_MIDI
|
| 53 |
-
|
| 54 |
-
def save_output_frame_level(self, pitch_score, path_save, note_or_freq="note"):
|
| 55 |
-
check_and_make_dir(Path(path_save))
|
| 56 |
-
f = open(path_save, "w")
|
| 57 |
-
|
| 58 |
-
assert (note_or_freq == "freq") or (note_or_freq == "note"), "please check 'note' or 'freq"
|
| 59 |
-
if note_or_freq == "freq":
|
| 60 |
-
for j in range(len(pitch_score)):
|
| 61 |
-
if pitch_score[j] > 0:
|
| 62 |
-
pitch_score[j] = 2 ** ((pitch_score[j] - 69) / 12.0) * 440
|
| 63 |
-
est = "%.2f %.4f\n" % (0.01 * j, pitch_score[j])
|
| 64 |
-
f.write(est)
|
| 65 |
-
elif note_or_freq == "note":
|
| 66 |
-
for j in range(len(pitch_score)):
|
| 67 |
-
est = "%.2f %.4f\n" % (0.01 * j, pitch_score[j])
|
| 68 |
-
f.write(est)
|
| 69 |
-
|
| 70 |
-
f.close()
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
def main(args):
|
| 74 |
-
ST = SingingTranscription()
|
| 75 |
-
|
| 76 |
-
""" load model """
|
| 77 |
-
model_ST = ST.load_model(f"{ST.PATH_PROJECT}/data/weight_ST.hdf5", TF_summary=False)
|
| 78 |
-
|
| 79 |
-
""" predict note (time-freq) """
|
| 80 |
-
path_audio = args.path_audio
|
| 81 |
-
fl_note = ST.predict_melody(model_ST, path_audio) # frame-level pitch score
|
| 82 |
-
|
| 83 |
-
""" post-processing """
|
| 84 |
-
tempo = calc_tempo(path_audio)
|
| 85 |
-
refined_fl_note = refine_note(fl_note, tempo) # frame-level pitch score
|
| 86 |
-
|
| 87 |
-
""" convert frame-level pitch score to note-level (time-axis) """
|
| 88 |
-
segment = note_to_segment(refined_fl_note) # note-level pitch score
|
| 89 |
-
|
| 90 |
-
""" save ouput to .mid """
|
| 91 |
-
filename = get_filename_wo_extension(path_audio)
|
| 92 |
-
path_output = f"{args.path_save}/{filename}.mid"
|
| 93 |
-
segment_to_midi(segment, path_output=path_output, tempo=tempo)
|
| 94 |
-
|
| 95 |
-
if args.output_type == "fps":
|
| 96 |
-
path_note = f"{args.path_save}/{filename}.txt"
|
| 97 |
-
ST.save_output_frame_level(refined_fl_note, path_note, note_or_freq="freq")
|
| 98 |
-
|
| 99 |
-
print(f"\n========= DONE =========")
|
| 100 |
-
print(f"input: '{path_audio}'")
|
| 101 |
-
print(f"output: '{path_output}'")
|
| 102 |
-
|
| 103 |
-
ST = SingingTranscription()
|
| 104 |
-
|
| 105 |
-
""" load model """
|
| 106 |
-
model_ST = ST.load_model(f"{ST.PATH_PROJECT}/data/weight_ST.hdf5", TF_summary=False)
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
def get_frame_level_output(wav_path):
|
| 110 |
-
""" predict note (time-freq) """
|
| 111 |
-
path_audio = wav_path
|
| 112 |
-
fl_note = ST.predict_melody(model_ST, path_audio) # frame-level pitch score
|
| 113 |
-
|
| 114 |
-
""" post-processing """
|
| 115 |
-
tempo = calc_tempo(path_audio)
|
| 116 |
-
refined_fl_note = refine_note(fl_note, tempo) # frame-level pitch score
|
| 117 |
-
return refined_fl_note.astype(int)
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
# %%
|
| 121 |
-
if __name__ == "__main__":
|
| 122 |
-
PATH_PROJECT = pathlib.Path(__file__).absolute().parent.parent
|
| 123 |
-
parser = argparse.ArgumentParser(description="Predict singing transcription")
|
| 124 |
-
parser.add_argument(
|
| 125 |
-
"-i",
|
| 126 |
-
"--path_audio",
|
| 127 |
-
type=str,
|
| 128 |
-
help="Path to input audio file.",
|
| 129 |
-
default=f"{PATH_PROJECT}/audio/pop1.wav",
|
| 130 |
-
)
|
| 131 |
-
parser.add_argument(
|
| 132 |
-
"-o",
|
| 133 |
-
"--path_save",
|
| 134 |
-
type=str,
|
| 135 |
-
help="Path to folder for saving .mid file",
|
| 136 |
-
default=f"{PATH_PROJECT}/output",
|
| 137 |
-
)
|
| 138 |
-
|
| 139 |
-
parser.add_argument(
|
| 140 |
-
"-ot",
|
| 141 |
-
"--output_type",
|
| 142 |
-
type=str,
|
| 143 |
-
help="(optional) Output type: midi or frame-level pitch score(fps)",
|
| 144 |
-
default="midi",
|
| 145 |
-
)
|
| 146 |
-
|
| 147 |
-
main(parser.parse_args())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
icassp2022_vocal_transcription/src/utils.py
DELETED
|
@@ -1,49 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import numpy as np
|
| 3 |
-
from pydub import AudioSegment
|
| 4 |
-
import pathlib
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
def check_and_make_dir(path_dir):
|
| 8 |
-
if not os.path.exists(os.path.dirname(path_dir)):
|
| 9 |
-
os.makedirs(os.path.dirname(path_dir))
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
def get_filename_wo_extension(path_dir):
|
| 13 |
-
return pathlib.Path(path_dir).stem
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
def note2pitch(pitch):
|
| 17 |
-
""" Convert MIDI number to freq.
|
| 18 |
-
----------
|
| 19 |
-
Parameters:
|
| 20 |
-
pitch: MIDI note numbers of pitch (array)
|
| 21 |
-
|
| 22 |
-
----------
|
| 23 |
-
Returns:
|
| 24 |
-
pitch: freqeuncy of pitch (array)
|
| 25 |
-
"""
|
| 26 |
-
|
| 27 |
-
pitch = np.array(pitch)
|
| 28 |
-
pitch[pitch > 0] = 2 ** ((pitch[pitch > 0] - 69) / 12.0) * 440
|
| 29 |
-
return pitch
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
def pitch2note(pitch):
|
| 33 |
-
""" Convert freq to MIDI number
|
| 34 |
-
----------
|
| 35 |
-
Parameters:
|
| 36 |
-
pitch: freqeuncy of pitch (array)
|
| 37 |
-
|
| 38 |
-
----------
|
| 39 |
-
Returns:
|
| 40 |
-
pitch: MIDI note numbers of pitch (array)
|
| 41 |
-
"""
|
| 42 |
-
pitch = np.array(pitch)
|
| 43 |
-
pitch[pitch > 0] = np.round((69.0 + 12.0 * np.log2(pitch[pitch > 0] / 440.0)))
|
| 44 |
-
return pitch
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
a = np.array([0, 0, 0, 1, 2, 3, 5, 0, 0, 0, 1, 2, 4, 5])
|
| 48 |
-
b = a[a > 0] * 2
|
| 49 |
-
print(b)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
infer_tool.py
CHANGED
|
@@ -1,100 +1,175 @@
|
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
-
import soundfile
|
| 5 |
import torch
|
| 6 |
import torchaudio
|
| 7 |
-
from pydub import AudioSegment
|
| 8 |
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
-
def
|
| 13 |
-
source
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
source = torchaudio.functional.resample(source, sr, 16000)
|
|
|
|
|
|
|
| 15 |
source = source.unsqueeze(0).to(dev)
|
| 16 |
with torch.inference_mode():
|
| 17 |
units = hubert_soft.units(source)
|
| 18 |
return units
|
| 19 |
|
| 20 |
|
| 21 |
-
def transcribe(
|
| 22 |
-
feature_pit = feature_input.compute_f0(
|
| 23 |
feature_pit = feature_pit * 2 ** (transform / 12)
|
| 24 |
feature_pit = resize2d_f0(feature_pit, length)
|
| 25 |
coarse_pit = feature_input.coarse_f0(feature_pit)
|
| 26 |
return coarse_pit
|
| 27 |
|
| 28 |
|
| 29 |
-
def
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
def infer(file_name, speaker_id, tran, target_sample, net_g_ms, hubert_soft, feature_input):
|
| 44 |
-
source_path = "./wav_temp/input/" + file_name
|
| 45 |
-
audio, sample_rate = torchaudio.load(source_path)
|
| 46 |
-
input_size = audio.shape[-1]
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
sid = torch.LongTensor([int(speaker_id)]).to(dev)
|
| 49 |
-
soft =
|
| 50 |
-
pitch =
|
| 51 |
-
pitch = torch.LongTensor(pitch).unsqueeze(0).to(dev)
|
| 52 |
stn_tst = torch.FloatTensor(soft)
|
| 53 |
with torch.no_grad():
|
| 54 |
x_tst = stn_tst.unsqueeze(0).to(dev)
|
| 55 |
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
|
| 56 |
audio = \
|
| 57 |
-
net_g_ms.infer(x_tst, x_tst_lengths, pitch, sid=sid, noise_scale
|
| 58 |
length_scale=1)[0][
|
| 59 |
0, 0].data.float().cpu().numpy()
|
| 60 |
-
|
| 61 |
-
int(audio.shape[0] / input_size * target_sample))
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
def resize2d_f0(x, target_len):
|
| 65 |
-
source = np.array(x)
|
| 66 |
-
source[source < 0.001] = np.nan
|
| 67 |
-
target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
|
| 68 |
-
source)
|
| 69 |
-
res = np.nan_to_num(target)
|
| 70 |
-
return res
|
| 71 |
-
|
| 72 |
|
| 73 |
-
# python删除文件的方法 os.remove(path)path指的是文件的绝对路径,如:
|
| 74 |
-
def del_file(path_data):
|
| 75 |
-
for i in os.listdir(path_data): # os.listdir(path_data)#返回一个列表,里面是当前目录下面的所有东西的相对路径
|
| 76 |
-
os.remove(path_data + i)
|
| 77 |
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
-
def cut(c_time, file_path, vocal_name, out_dir):
|
| 80 |
-
audio_segment = AudioSegment.from_file(file_path, format='wav')
|
| 81 |
|
| 82 |
-
|
| 83 |
-
for i in range(total):
|
| 84 |
-
# 将音频10s切片,并以顺序进行命名
|
| 85 |
-
audio_segment[i * c_time * 1000:(i + 1) * c_time * 1000].export(f"{out_dir}/{vocal_name}-{i}.wav",
|
| 86 |
-
format="wav")
|
| 87 |
-
audio_segment[total * c_time * 1000:].export(f"{out_dir}/{vocal_name}-{total}.wav", format="wav") # 缺少结尾的音频片段
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
def wav_resample(audio_path, tar_sample):
|
| 91 |
raw_audio, raw_sample_rate = torchaudio.load(audio_path)
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
| 95 |
|
| 96 |
|
| 97 |
def fill_a_to_b(a, b):
|
| 98 |
if len(a) < len(b):
|
| 99 |
for _ in range(0, len(b) - len(a)):
|
| 100 |
a.append(a[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import torch
import torchaudio

import hubert_model
import utils
from models import SynthesizerTrn
from preprocess_wave import FeatureInput

logging.getLogger('matplotlib').setLevel(logging.WARNING)
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def timeit(func):
    def run(*args, **kwargs):
        t = time.time()
        res = func(*args, **kwargs)
        print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
        return res

    return run


def get_end_file(dir_path, end):
    file_lists = []
    for root, dirs, files in os.walk(dir_path):
        files = [f for f in files if f[0] != '.']
        dirs[:] = [d for d in dirs if d[0] != '.']
        for f_file in files:
            if f_file.endswith(end):
                file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
    return file_lists


def load_model(model_path, config_path):
    # read the model configuration
    hps_ms = utils.get_hparams_from_file(config_path)
    n_g_ms = SynthesizerTrn(
        178,
        hps_ms.data.filter_length // 2 + 1,
        hps_ms.train.segment_size // hps_ms.data.hop_length,
        n_speakers=hps_ms.data.n_speakers,
        **hps_ms.model)
    _ = utils.load_checkpoint(model_path, n_g_ms, None)
    _ = n_g_ms.eval().to(dev)
    # load the HuBERT content encoder
    hubert_soft = hubert_model.hubert_soft(get_end_file("./", "pt")[0])
    feature_input = FeatureInput(hps_ms.data.sampling_rate, hps_ms.data.hop_length)
    return n_g_ms, hubert_soft, feature_input, hps_ms


def resize2d_f0(x, target_len):
    source = np.array(x)
    source[source < 0.001] = np.nan
    target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
                       source)
    res = np.nan_to_num(target)
    return res


def get_units(in_path, hubert_soft):
    source, sr = torchaudio.load(in_path)
    source = torchaudio.functional.resample(source, sr, 16000)
    if len(source.shape) == 2 and source.shape[1] >= 2:
        source = torch.mean(source, dim=0).unsqueeze(0)
    source = source.unsqueeze(0).to(dev)
    with torch.inference_mode():
        units = hubert_soft.units(source)
    return units
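A minimal usage sketch for the unit extraction above, assuming this module is importable as infer_tool and that a generator checkpoint, a config.json and a HuBERT-soft .pt file sit in the working directory (all three paths below are placeholders):

import infer_tool

net_g_ms, hubert_soft, feature_input, hps_ms = infer_tool.load_model("./G_88000.pth", "./config.json")
units = infer_tool.get_units("./raw/source.wav", hubert_soft)
# one 256-dimensional soft unit per ~20 ms of 16 kHz audio for the HuBERT-soft checkpoint
print(units.shape)  # e.g. torch.Size([1, 500, 256]) for a ten-second clip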

def transcribe(source_path, length, transform, feature_input):
    feature_pit = feature_input.compute_f0(source_path)
    feature_pit = feature_pit * 2 ** (transform / 12)
    feature_pit = resize2d_f0(feature_pit, length)
    coarse_pit = feature_input.coarse_f0(feature_pit)
    return coarse_pit


def get_unit_pitch(in_path, tran, hubert_soft, feature_input):
    soft = get_units(in_path, hubert_soft).squeeze(0).cpu().numpy()
    input_pitch = transcribe(in_path, soft.shape[0], tran, feature_input)
    return soft, input_pitch


def clean_pitch(input_pitch):
    num_nan = np.sum(input_pitch == 1)
    if num_nan / len(input_pitch) > 0.9:
        input_pitch[input_pitch != 1] = 1
    return input_pitch


def plt_pitch(input_pitch):
    input_pitch = input_pitch.astype(float)
    input_pitch[input_pitch == 1] = np.nan
    return input_pitch


def f0_to_pitch(ff):
    f0_pitch = 69 + 12 * np.log2(ff / 440)
    return f0_pitch


def f0_plt(in_path, out_path, tran, hubert_soft, feature_input):
    s1, input_pitch = get_unit_pitch(in_path, tran, hubert_soft, feature_input)
    s2, output_pitch = get_unit_pitch(out_path, 0, hubert_soft, feature_input)
    plt.clf()
    plt.plot(plt_pitch(input_pitch), color="#66ccff")
    plt.plot(plt_pitch(output_pitch), color="orange")
    plt.savefig("temp.jpg")


def calc_error(in_path, out_path, tran, feature_input):
    input_pitch = feature_input.compute_f0(in_path)
    output_pitch = feature_input.compute_f0(out_path)
    sum_y = []
    if np.sum(input_pitch == 0) / len(input_pitch) > 0.9:
        mistake, var_take = 0, 0
    else:
        for i in range(min(len(input_pitch), len(output_pitch))):
            if input_pitch[i] > 0 and output_pitch[i] > 0:
                sum_y.append(abs(f0_to_pitch(output_pitch[i]) - (f0_to_pitch(input_pitch[i]) + tran)))
        num_y = 0
        for x in sum_y:
            num_y += x
        len_y = len(sum_y) if len(sum_y) else 1
        mistake = round(float(num_y / len_y), 2)
        var_take = round(float(np.std(sum_y, ddof=1)), 2)
    return mistake, var_take


def infer(source_path, speaker_id, tran, net_g_ms, hubert_soft, feature_input):
    sid = torch.LongTensor([int(speaker_id)]).to(dev)
    soft, pitch = get_unit_pitch(source_path, tran, hubert_soft, feature_input)
    pitch = torch.LongTensor(clean_pitch(pitch)).unsqueeze(0).to(dev)
    stn_tst = torch.FloatTensor(soft)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(dev)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
        audio = \
            net_g_ms.infer(x_tst, x_tst_lengths, pitch, sid=sid, noise_scale=0.3, noise_scale_w=0.5,
                           length_scale=1)[0][
                0, 0].data.float().cpu().numpy()
    return audio, audio.shape[-1]


def del_temp_wav(path_data):
    for i in get_end_file(path_data, "wav"):  # every .wav found (recursively) under path_data
        os.remove(i)


def format_wav(audio_path, tar_sample):
    raw_audio, raw_sample_rate = torchaudio.load(audio_path)
    if len(raw_audio.shape) == 2 and raw_audio.shape[1] >= 2:
        raw_audio = torch.mean(raw_audio, dim=0).unsqueeze(0)
    tar_audio = torchaudio.functional.resample(raw_audio, raw_sample_rate, tar_sample)
    torchaudio.save(audio_path[:-4] + ".wav", tar_audio, tar_sample)
    return tar_audio, tar_sample


def fill_a_to_b(a, b):
    if len(a) < len(b):
        for _ in range(0, len(b) - len(a)):
            a.append(a[0])


def mkdir(paths: list):
    for path in paths:
        if not os.path.exists(path):
            os.mkdir(path)
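End to end, the new helpers compose as in this hedged sketch; the checkpoint/config paths, speaker id 0 and the +2 semitone shift are illustrative assumptions:

import infer_tool

net_g_ms, hubert_soft, feature_input, hps_ms = infer_tool.load_model("./G_88000.pth", "./config.json")
infer_tool.mkdir(["./wav_temp", "./wav_temp/input", "./wav_temp/output"])
# force mono and the model's sampling rate before inference
infer_tool.format_wav("./wav_temp/input/source.wav", hps_ms.data.sampling_rate)
audio, frames = infer_tool.infer("./wav_temp/input/source.wav", speaker_id=0, tran=2,
                                 net_g_ms=net_g_ms, hubert_soft=hubert_soft,
                                 feature_input=feature_input)
feature_input.save_wav(audio, "./wav_temp/output/source.wav")
# compare source and converted f0 curves (written to temp.jpg) and their mean error in semitones
infer_tool.f0_plt("./wav_temp/input/source.wav", "./wav_temp/output/source.wav", 2, hubert_soft, feature_input)
mistake, var_take = infer_tool.calc_error("./wav_temp/input/source.wav", "./wav_temp/output/source.wav", 2, feature_input)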
models.py
CHANGED
@@ -1,15 +1,15 @@
-import copy
 import math
+import math
+
 import torch
 from torch import nn
+from torch.nn import Conv1d, ConvTranspose1d, Conv2d
 from torch.nn import functional as F
-import
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+
+import attentions
 import commons
 import modules
-import attentions
-
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from commons import init_weights, get_padding
 
 
@@ -189,7 +189,7 @@ class TextEncoder(nn.Module):
 
         # self.emb = nn.Embedding(n_vocab, hidden_channels)
         # nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
-        self.emb_pitch = nn.Embedding(
+        self.emb_pitch = nn.Embedding(256, hidden_channels)
         nn.init.normal_(self.emb_pitch.weight, 0.0, hidden_channels ** -0.5)
 
         self.encoder = attentions.Encoder(
@@ -491,8 +491,8 @@ class SynthesizerTrn(nn.Module):
         self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
                                       gin_channels=gin_channels)
         self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
-        self.pitch_net = PitchPredictor(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers,
+        # self.pitch_net = PitchPredictor(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers,
+        #                                 kernel_size, p_dropout)
 
         if use_sdp:
             self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
@@ -504,12 +504,6 @@ class SynthesizerTrn(nn.Module):
 
     def infer(self, x, x_lengths, pitch, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
         x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, pitch)
-        pred_pitch, pitch_embedding = self.pitch_net(x, x_mask)
-        x = x + pitch_embedding
-        # print(pred_pitch)
-        gt_lf0 = torch.log(440 * (2 ** ((pitch - 69) / 12)))
-
-        # print(gt_lf0)
         if self.n_speakers > 0:
             g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
         else:
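The only functional changes to models.py are visible above: the pitch-predictor branch is commented out of SynthesizerTrn.infer, and TextEncoder's pitch embedding now has a fixed vocabulary of 256 entries, matching the 256 coarse-F0 bins produced by preprocess_wave.FeatureInput. A toy illustration of what that embedding consumes (hidden_channels=192 is only the usual VITS default, an assumption here):

import torch
from torch import nn

emb_pitch = nn.Embedding(256, 192)            # 256 coarse-f0 bins -> hidden vectors
coarse_pit = torch.randint(1, 256, (1, 120))  # stand-in for FeatureInput.coarse_f0 output, one index per frame
pitch_vec = emb_pitch(coarse_pit)             # [1, 120, 192], presumably added to the unit features inside TextEncoder
print(pitch_vec.shape)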
modules.py
CHANGED
@@ -1,187 +1,184 @@
-import copy
 import math
-
-import scipy
+
 import torch
 from torch import nn
-from torch.nn import
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn import Conv1d
+from torch.nn import functional as t_func
 from torch.nn.utils import weight_norm, remove_weight_norm
 
 import commons
 from commons import init_weights, get_padding
 from transforms import piecewise_rational_quadratic_transform
 
-
 LRELU_SLOPE = 0.1
 
 
 class LayerNorm(nn.Module):
-    def forward(self, x):
-        x = x.transpose(1, -1)
-        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
-        return x.transpose(1, -1)
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        x = x.transpose(1, -1)
+        x = t_func.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+        return x.transpose(1, -1)
 
 
 class ConvReluNorm(nn.Module):
+    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        assert n_layers > 1, "Number of layers should be larger than 0."
+
+        self.conv_layers = nn.ModuleList()
+        self.norm_layers = nn.ModuleList()
+        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+        self.norm_layers.append(LayerNorm(hidden_channels))
+        self.relu_drop = nn.Sequential(
+            nn.ReLU(),
+            nn.Dropout(p_dropout))
+        for _ in range(n_layers - 1):
+            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+            self.norm_layers.append(LayerNorm(hidden_channels))
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+
+    def forward(self, x, x_mask):
+        x_org = x
+        for i in range(self.n_layers):
+            x = self.conv_layers[i](x * x_mask)
+            x = self.norm_layers[i](x)
+            x = self.relu_drop(x)
+        x = x_org + self.proj(x)
+        return x * x_mask
 
 
 class DDSConv(nn.Module):
+    """
+    Dilated and Depth-Separable Convolution
+    """
+
+    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
+        super().__init__()
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+
+        self.drop = nn.Dropout(p_dropout)
+        self.convs_sep = nn.ModuleList()
+        self.convs_1x1 = nn.ModuleList()
+        self.norms_1 = nn.ModuleList()
+        self.norms_2 = nn.ModuleList()
+        for i in range(n_layers):
+            dilation = kernel_size ** i
+            padding = (kernel_size * dilation - dilation) // 2
+            self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
+                                            groups=channels, dilation=dilation, padding=padding
+                                            ))
+            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+            self.norms_1.append(LayerNorm(channels))
+            self.norms_2.append(LayerNorm(channels))
+
+    def forward(self, x, x_mask, g=None):
+        if g is not None:
+            x = x + g
+        for i in range(self.n_layers):
+            y = self.convs_sep[i](x * x_mask)
+            y = self.norms_1[i](y)
+            y = t_func.gelu(y)
+            y = self.convs_1x1[i](y)
+            y = self.norms_2[i](y)
+            y = t_func.gelu(y)
+            y = self.drop(y)
+            x = x + y
+        return x * x_mask
 
 
 class WN(torch.nn.Module):
+    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+        super(WN, self).__init__()
+        assert (kernel_size % 2 == 1)
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size,
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.p_dropout = p_dropout
+
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+
+        if gin_channels != 0:
+            cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
+            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+
+        for i in range(n_layers):
+            dilation = dilation_rate ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
+                                       dilation=dilation, padding=padding)
+            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
+            self.in_layers.append(in_layer)
+
+            # last one is not necessary
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+
+            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, x, x_mask, g=None, **kwargs):
+        output = torch.zeros_like(x)
+        n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+        if g is not None:
+            g = self.cond_layer(g)
+
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+            if g is not None:
+                cond_offset = i * 2 * self.hidden_channels
+                g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
+            else:
+                g_l = torch.zeros_like(x_in)
+
+            acts = commons.fused_add_tanh_sigmoid_multiply(
+                x_in,
+                g_l,
+                n_channels_tensor)
+            acts = self.drop(acts)
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                res_acts = res_skip_acts[:, :self.hidden_channels, :]
+                x = (x + res_acts) * x_mask
+                output = output + res_skip_acts[:, self.hidden_channels:, :]
+            else:
+                output = output + res_skip_acts
+        return output * x_mask
+
+    def remove_weight_norm(self):
+        if self.gin_channels != 0:
+            torch.nn.utils.remove_weight_norm(self.cond_layer)
+        for l in self.in_layers:
+            torch.nn.utils.remove_weight_norm(l)
+        for l in self.res_skip_layers:
+            torch.nn.utils.remove_weight_norm(l)
 
 
 class ResBlock1(torch.nn.Module):
@@ -209,11 +206,11 @@ class ResBlock1(torch.nn.Module):
 
     def forward(self, x, x_mask=None):
         for c1, c2 in zip(self.convs1, self.convs2):
-            xt =
+            xt = t_func.leaky_relu(x, LRELU_SLOPE)
             if x_mask is not None:
                 xt = xt * x_mask
             xt = c1(xt)
-            xt =
+            xt = t_func.leaky_relu(xt, LRELU_SLOPE)
             if x_mask is not None:
                 xt = xt * x_mask
             xt = c2(xt)
@@ -242,7 +239,7 @@ class ResBlock2(torch.nn.Module):
 
     def forward(self, x, x_mask=None):
         for c in self.convs:
-            xt =
+            xt = t_func.leaky_relu(x, LRELU_SLOPE)
             if x_mask is not None:
                 xt = xt * x_mask
             xt = c(xt)
@@ -257,134 +254,135 @@ class ResBlock2(torch.nn.Module):
 
 
 class Log(nn.Module):
+    def forward(self, x, x_mask, reverse=False, **kwargs):
+        if not reverse:
+            y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
+            logdet = torch.sum(-y, [1, 2])
+            return y, logdet
+        else:
+            x = torch.exp(x) * x_mask
+            return x
 
 
 class Flip(nn.Module):
+    def forward(self, x, *args, reverse=False, **kwargs):
+        x = torch.flip(x, [1])
+        if not reverse:
+            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+            return x, logdet
+        else:
+            return x
 
 
 class ElementwiseAffine(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.channels = channels
+        self.m = nn.Parameter(torch.zeros(channels, 1))
+        self.logs = nn.Parameter(torch.zeros(channels, 1))
+
+    def forward(self, x, x_mask, reverse=False, **kwargs):
+        if not reverse:
+            y = self.m + torch.exp(self.logs) * x
+            y = y * x_mask
+            logdet = torch.sum(self.logs * x_mask, [1, 2])
+            return y, logdet
+        else:
+            x = (x - self.m) * torch.exp(-self.logs) * x_mask
+            return x
 
 
 class ResidualCouplingLayer(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 p_dropout=0,
+                 gin_channels=0,
+                 mean_only=False):
+        assert channels % 2 == 0, "channels should be divisible by 2"
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
+                      gin_channels=gin_channels)
+        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        h = self.pre(x0) * x_mask
+        h = self.enc(h, x_mask, g=g)
+        stats = self.post(h) * x_mask
+        if not self.mean_only:
+            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+        else:
+            m = stats
+            logs = torch.zeros_like(m)
+
+        if not reverse:
+            x1 = m + x1 * torch.exp(logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            logdet = torch.sum(logs, [1, 2])
+            return x, logdet
+        else:
+            x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            return x
 
 
 class ConvFlow(nn.Module):
+    def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
+        super().__init__()
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.num_bins = num_bins
+        self.tail_bound = tail_bound
+        self.half_channels = in_channels // 2
+
+        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
+        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
+        self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        h = self.pre(x0)
+        h = self.convs(h, x_mask, g=g)
+        h = self.proj(h) * x_mask
+
+        b, c, t = x0.shape
+        h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)  # [b, cx?, t] -> [b, c, t, ?]
+
+        unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
+        unnormalized_heights = h[..., self.num_bins:2 * self.num_bins] / math.sqrt(self.filter_channels)
+        unnormalized_derivatives = h[..., 2 * self.num_bins:]
+
+        x1, logabsdet = piecewise_rational_quadratic_transform(x1,
+                                                               unnormalized_widths,
+                                                               unnormalized_heights,
+                                                               unnormalized_derivatives,
+                                                               inverse=reverse,
+                                                               tails='linear',
+                                                               tail_bound=self.tail_bound
+                                                               )
+
+        x = torch.cat([x0, x1], 1) * x_mask
+        logdet = torch.sum(logabsdet * x_mask, [1, 2])
+        if not reverse:
+            return x, logdet
+        else:
+            return x
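All the flow blocks above (Log, Flip, ElementwiseAffine, ResidualCouplingLayer, ConvFlow) follow one convention: the forward direction returns (y, logdet) while reverse=True returns only the reconstructed input. A self-contained numerical check of that convention, using plain tensors shaped like ElementwiseAffine's parameters:

import torch

m, logs = torch.zeros(4, 1), torch.full((4, 1), 0.5)  # stand-ins for ElementwiseAffine.m / .logs
x = torch.randn(2, 4, 10)
x_mask = torch.ones(2, 1, 10)

y = (m + torch.exp(logs) * x) * x_mask                 # forward pass
logdet = torch.sum(logs * x_mask, [1, 2])              # per-sample log-determinant
x_rec = (y - m) * torch.exp(-logs) * x_mask            # reverse pass
print(torch.allclose(x * x_mask, x_rec, atol=1e-6), logdet.shape)  # True torch.Size([2])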
preprocess_wave.py
ADDED
@@ -0,0 +1,118 @@
import os

import librosa
import numpy as np
import pyworld
from scipy.io import wavfile

import utils


class FeatureInput(object):
    def __init__(self, samplerate=16000, hop_size=160):
        self.fs = samplerate
        self.hop = hop_size

        self.f0_bin = 256
        self.f0_max = 1100.0
        self.f0_min = 50.0
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

    def compute_f0(self, path):
        x, sr = librosa.load(path, sr=self.fs)
        assert sr == self.fs
        f0, t = pyworld.dio(
            x.astype(np.double),
            fs=sr,
            f0_ceil=800,
            frame_period=1000 * self.hop / sr,
        )
        f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
        for index, pitch in enumerate(f0):
            f0[index] = round(pitch, 1)
        return f0

    # for numpy # code from diffsinger
    def coarse_f0(self, f0):
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
            self.f0_bin - 2
        ) / (self.f0_mel_max - self.f0_mel_min) + 1

        # use 0 or 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
        f0_coarse = np.rint(f0_mel).astype(np.int)
        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
            f0_coarse.max(),
            f0_coarse.min(),
        )
        return f0_coarse

    # for tensor # code from diffsinger
    def coarse_f0_ts(self, f0):
        f0_mel = 1127 * (1 + f0 / 700).log()
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
            self.f0_bin - 2
        ) / (self.f0_mel_max - self.f0_mel_min) + 1

        # use 0 or 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
        f0_coarse = (f0_mel + 0.5).long()
        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
            f0_coarse.max(),
            f0_coarse.min(),
        )
        return f0_coarse

    def save_wav(self, wav, path):
        wav *= 32767 / max(0.01, np.max(np.abs(wav))) * 0.6
        wavfile.write(path, self.fs, wav.astype(np.int16))

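A quick numerical check of the coarse_f0 quantisation above, reproduced with plain numpy: an unvoiced frame (f0 = 0) maps to the reserved bin 1, and a frame at A4 (440 Hz) lands a bit past the middle of the 255 usable mel-spaced bins.

import numpy as np

f0_bin, f0_min, f0_max = 256, 50.0, 1100.0
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

f0 = np.array([0.0, 440.0])
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
f0_mel[f0_mel <= 1] = 1
print(np.rint(f0_mel).astype(int))  # -> [  1 122]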
if __name__ == "__main__":
    wavPath = "./data/waves"
    outPath = "./data/label"
    if not os.path.exists("./data/label"):
        os.mkdir("./data/label")

    # read sampling rate and hop length from the training config
    hps = utils.get_hparams_from_file("./configs/singing_base.json")
    featureInput = FeatureInput(hps.data.sampling_rate, hps.data.hop_length)
    vits_file = open("./filelists/vc_file.txt", "w", encoding="utf-8")

    for spks in os.listdir(wavPath):
        if os.path.isdir(f"./{wavPath}/{spks}"):
            os.makedirs(f"./{outPath}/{spks}")
            for file in os.listdir(f"./{wavPath}/{spks}"):
                if file.endswith(".wav"):
                    file = file[:-4]
                    audio_path = f"./{wavPath}/{spks}/{file}.wav"
                    featur_pit = featureInput.compute_f0(audio_path)
                    coarse_pit = featureInput.coarse_f0(featur_pit)
                    np.save(
                        f"{outPath}/{spks}/{file}_pitch.npy",
                        coarse_pit,
                        allow_pickle=False,
                    )
                    np.save(
                        f"{outPath}/{spks}/{file}_nsff0.npy",
                        featur_pit,
                        allow_pickle=False,
                    )

                    path_audio = f"./data/waves/{spks}/{file}.wav"
                    path_spkid = f"./data/spkid/{spks}.npy"
                    path_label = (
                        f"./data/phone/{spks}/{file}.npy"  # phone means ppg & hubert
                    )
                    path_pitch = f"./data/label/{spks}/{file}_pitch.npy"
                    path_nsff0 = f"./data/label/{spks}/{file}_nsff0.npy"
                    print(
                        f"{path_audio}|{path_spkid}|{path_label}|{path_pitch}|{path_nsff0}",
                        file=vits_file,
                    )

    vits_file.close()
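The __main__ block writes ./filelists/vc_file.txt with one '|'-separated record per clip (audio | speaker-id npy | hubert/ppg npy | coarse pitch npy | raw f0 npy). A minimal reader for that format, shown only as a sketch:

def read_vc_file(path="./filelists/vc_file.txt"):
    records = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            wav, spkid, phone, pitch, nsff0 = line.strip().split("|")
            records.append({"wav": wav, "spkid": spkid, "phone": phone,
                            "pitch": pitch, "nsff0": nsff0})
    return records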
requirements.txt
CHANGED
@@ -1,16 +1,16 @@
 Cython==0.29.21
 librosa==0.8.0
-matplotlib
+matplotlib==3.3.1
+numpy==1.18.5
+phonemizer==2.2.1
+scipy==1.5.2
 torch
 torchvision
-Unidecode
+Unidecode==1.1.1
 torchaudio
 pyworld
+scipy
 keras
 mir-eval
 pretty-midi
-numpy
-pydub
+pydub
text/LICENSE
DELETED
@@ -1,19 +0,0 @@
Copyright (c) 2017 Keith Ito

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
text/__init__.py
DELETED
@@ -1,54 +0,0 @@
""" from https://github.com/keithito/tacotron """
from text import cleaners
from text.symbols import symbols


# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}


def text_to_sequence(text, cleaner_names):
  '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through
    Returns:
      List of integers corresponding to the symbols in the text
  '''
  sequence = []

  clean_text = _clean_text(text, cleaner_names)
  for symbol in clean_text:
    symbol_id = _symbol_to_id[symbol]
    sequence += [symbol_id]
  return sequence


def cleaned_text_to_sequence(cleaned_text):
  '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
      text: string to convert to a sequence
    Returns:
      List of integers corresponding to the symbols in the text
  '''
  sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
  return sequence


def sequence_to_text(sequence):
  '''Converts a sequence of IDs back to a string'''
  result = ''
  for symbol_id in sequence:
    s = _id_to_symbol[symbol_id]
    result += s
  return result


def _clean_text(text, cleaner_names):
  for name in cleaner_names:
    cleaner = getattr(cleaners, name)
    if not cleaner:
      raise Exception('Unknown cleaner: %s' % name)
    text = cleaner(text)
  return text
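Since the whole text/ package is dropped in this commit (the model is now driven by HuBERT units and coarse pitch rather than text), the round trip below only works against the previous revision; it shows what the removed frontend used to provide:

from text import text_to_sequence, sequence_to_text

ids = text_to_sequence("hello world.", ["basic_cleaners"])  # basic_cleaners avoids the espeak dependency
print(ids)
print(sequence_to_text(ids))  # -> "hello world."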
text/cleaners.py
DELETED
@@ -1,100 +0,0 @@
""" from https://github.com/keithito/tacotron """

'''
Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
'''

import re
from unidecode import unidecode
from phonemizer import phonemize


# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
  ('mrs', 'misess'),
  ('mr', 'mister'),
  ('dr', 'doctor'),
  ('st', 'saint'),
  ('co', 'company'),
  ('jr', 'junior'),
  ('maj', 'major'),
  ('gen', 'general'),
  ('drs', 'doctors'),
  ('rev', 'reverend'),
  ('lt', 'lieutenant'),
  ('hon', 'honorable'),
  ('sgt', 'sergeant'),
  ('capt', 'captain'),
  ('esq', 'esquire'),
  ('ltd', 'limited'),
  ('col', 'colonel'),
  ('ft', 'fort'),
]]


def expand_abbreviations(text):
  for regex, replacement in _abbreviations:
    text = re.sub(regex, replacement, text)
  return text


def expand_numbers(text):
  return normalize_numbers(text)


def lowercase(text):
  return text.lower()


def collapse_whitespace(text):
  return re.sub(_whitespace_re, ' ', text)


def convert_to_ascii(text):
  return unidecode(text)


def basic_cleaners(text):
  '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
  text = lowercase(text)
  text = collapse_whitespace(text)
  return text


def transliteration_cleaners(text):
  '''Pipeline for non-English text that transliterates to ASCII.'''
  text = convert_to_ascii(text)
  text = lowercase(text)
  text = collapse_whitespace(text)
  return text


def english_cleaners(text):
  '''Pipeline for English text, including abbreviation expansion.'''
  text = convert_to_ascii(text)
  text = lowercase(text)
  text = expand_abbreviations(text)
  phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
  phonemes = collapse_whitespace(phonemes)
  return phonemes


def english_cleaners2(text):
  '''Pipeline for English text, including abbreviation expansion. + punctuation + stress'''
  text = convert_to_ascii(text)
  text = lowercase(text)
  text = expand_abbreviations(text)
  phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True)
  phonemes = collapse_whitespace(phonemes)
  return phonemes
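Likewise only meaningful on the old revision, a few one-liners showing what the removed cleaners did (english_cleaners additionally needs phonemizer with an espeak backend, so it is not exercised here):

from text.cleaners import basic_cleaners, convert_to_ascii, expand_abbreviations

print(basic_cleaners("Hello   World"))                   # -> "hello world"
print(convert_to_ascii("naïve café"))                    # -> "naive cafe"
print(expand_abbreviations("Dr. Smith met Mr. Jones."))  # -> "doctor Smith met mister Jones."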
text/symbols.py
DELETED
@@ -1,16 +0,0 @@
""" from https://github.com/keithito/tacotron """

'''
Defines the set of symbols used in text input to the model.
'''
_pad = '_'
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"


# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

# Special symbol ids
SPACE_ID = symbols.index(" ")