hzrr committed
Commit 2b37c27 · 1 Parent(s): 62f6e75
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full changeset.
Files changed (50)
  1. .gitattributes +0 -32
  2. LICENSE +0 -21
  3. README.md +4 -4
  4. app.py +60 -66
  5. attentions.py +294 -286
  6. commons.py +99 -100
  7. configs/yilanqiu.json → config.json +15 -10
  8. configs/nyarumul.json +0 -53
  9. configs/nyarusing.json +0 -52
  10. data.py +0 -36
  11. data_utils.py +12 -14
  12. hubert/__init__.py +0 -8
  13. hubert/__pycache__/__init__.cpython-38.pyc +0 -0
  14. hubert/__pycache__/model.cpython-38.pyc +0 -0
  15. hubert/dataset.py +0 -91
  16. hubert/utils.py +0 -58
  17. hubert/model.py → hubert_model.py +25 -91
  18. icassp2022_vocal_transcription/.gitignore +0 -3
  19. icassp2022_vocal_transcription/README.md +0 -56
  20. icassp2022_vocal_transcription/__init__.py +0 -3
  21. icassp2022_vocal_transcription/__pycache__/__init__.cpython-38.pyc +0 -0
  22. icassp2022_vocal_transcription/data/weight_ST.hdf5 +0 -3
  23. icassp2022_vocal_transcription/data/x_train_mean.npy +0 -3
  24. icassp2022_vocal_transcription/data/x_train_std.npy +0 -3
  25. icassp2022_vocal_transcription/img/ICASSP2022-fig1-2.png +0 -0
  26. icassp2022_vocal_transcription/img/example_pop1_midi.png +0 -0
  27. icassp2022_vocal_transcription/requirements.txt +0 -8
  28. icassp2022_vocal_transcription/src/MIDI.py +0 -141
  29. icassp2022_vocal_transcription/src/__init__.py +0 -0
  30. icassp2022_vocal_transcription/src/__pycache__/MIDI.cpython-38.pyc +0 -0
  31. icassp2022_vocal_transcription/src/__pycache__/__init__.cpython-38.pyc +0 -0
  32. icassp2022_vocal_transcription/src/__pycache__/featureExtraction.cpython-38.pyc +0 -0
  33. icassp2022_vocal_transcription/src/__pycache__/model.cpython-38.pyc +0 -0
  34. icassp2022_vocal_transcription/src/__pycache__/quantization.cpython-38.pyc +0 -0
  35. icassp2022_vocal_transcription/src/__pycache__/singing_transcription.cpython-38.pyc +0 -0
  36. icassp2022_vocal_transcription/src/__pycache__/utils.cpython-38.pyc +0 -0
  37. icassp2022_vocal_transcription/src/featureExtraction.py +0 -61
  38. icassp2022_vocal_transcription/src/model.py +0 -139
  39. icassp2022_vocal_transcription/src/quantization.py +0 -217
  40. icassp2022_vocal_transcription/src/singing_transcription.py +0 -147
  41. icassp2022_vocal_transcription/src/utils.py +0 -49
  42. infer_tool.py +132 -57
  43. models.py +9 -15
  44. modules.py +282 -284
  45. preprocess_wave.py +118 -0
  46. requirements.txt +7 -7
  47. text/LICENSE +0 -19
  48. text/__init__.py +0 -54
  49. text/cleaners.py +0 -100
  50. text/symbols.py +0 -16
.gitattributes DELETED
@@ -1,32 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
- icassp2022_vocal_transcription/data/weight_ST.hdf5 filter=lfs diff=lfs merge=lfs -text
LICENSE DELETED
@@ -1,21 +0,0 @@
- MIT License
-
- Copyright (c) 2021 Jaehyeon Kim
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: Sovits Midi Dev
- emoji: 🐨
- colorFrom: blue
- colorTo: red
  sdk: gradio
  sdk_version: 3.4
  app_file: app.py

  ---
+ title: Sovits F0
+ emoji: 🚀
+ colorFrom: purple
+ colorTo: gray
  sdk: gradio
  sdk_version: 3.4
  app_file: app.py
app.py CHANGED
@@ -1,77 +1,45 @@
- import logging

  import gradio as gr
  import torch
- import torchaudio

- import hubert
- import icassp2022_vocal_transcription
  import infer_tool
- import utils
- from models import SynthesizerTrn

- dev = torch.device("cpu")
- numba_logger = logging.getLogger('numba')
- numba_logger.setLevel(logging.WARNING)
  convert_cnt = [0]

- hps_ms = utils.get_hparams_from_file("configs/yilanqiu.json")
- net_g_ms = SynthesizerTrn(
- 178,
- hps_ms.data.filter_length // 2 + 1,
- hps_ms.train.segment_size // hps_ms.data.hop_length,
- n_speakers=hps_ms.data.n_speakers,
- **hps_ms.model)
-
- hubert_soft = hubert.hubert_soft('hubert.pt')
- _ = utils.load_checkpoint("1121_epochs.pth", net_g_ms, None)
- _ = net_g_ms.eval().to(dev)


  def vc_fn(sid, audio_record, audio_upload, tran):
  if audio_upload is not None:
  audio_path = audio_upload
  elif audio_record is not None:
  audio_path = audio_record
  else:
- return "你需要上传wav文件或自行录音", None
- target_sample = hps_ms.data.sampling_rate
- audio_path = infer_tool.wav_resample(audio_path, target_sample)
- audio, sampling_rate = torchaudio.load(audio_path)
  duration = audio.shape[0] / sampling_rate
- if duration > 45:
- return "请上传小于45s的音频,需要转换长音频请使用colab", None
-
- soft = infer_tool.get_units(audio_path, hubert_soft).squeeze(0).cpu().numpy()
-
- pitch = icassp2022_vocal_transcription.transcribe(audio_path)
- pitch[pitch != 0] = pitch[pitch != 0] + tran
- if tran == 100:
- pitch[:] = 0
- pitch = infer_tool.resize2d_plus(pitch, len(soft[:, 0]))
- pitch = torch.LongTensor(pitch).unsqueeze(0).to(dev)
-
- sid = torch.LongTensor([2]).to(dev) if sid == "" else torch.LongTensor([1]).to(dev)
- stn_tst = torch.FloatTensor(soft)
- with torch.no_grad():
- x_tst = stn_tst.unsqueeze(0).to(dev)
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
- audio = net_g_ms.infer(x_tst, x_tst_lengths, pitch=pitch, sid=sid, noise_scale=0.3,
- noise_scale_w=0.1, length_scale=1)[0][0, 0].data.float().cpu().numpy()
- convert_cnt[0] += 1
- print(convert_cnt[0])
- return "Success", (hps_ms.data.sampling_rate, audio)
-
- character_dict = {
- "夜刀神十香": 1,
- "鸢一折纸": 2,
- "时崎狂三": 3,
- "冰芽川四糸乃": 4,
- "五河琴里": 5,
- "八舞夕弦": 6,
- "八舞耶俱矢": 7,
- "诱宵美九": 8,
- }


  app = gr.Blocks()
@@ -79,26 +47,52 @@ with app:
  with gr.Tabs():
  with gr.TabItem("Basic"):
  gr.Markdown(value="""
- 本模型为sovits_midi(专供语音合成,为下面git的dev分支)

- 本hug仅供一键秋秋人使用(有语音授权,但是二创不要创死主播)

- 支持**45s以内**的**无伴奏wav格式**,或使用**网页内置**的录音(二选一),转换效果取决于源音频语气、节奏是否与目标音色相近。

- 如:女声歌曲转换,相似度远小于男声转换

- 该模型的 [github仓库链接](https://github.com/innnky/so-vits-svc)

- 如果想自己制作并训练模型可以访问这个 [github仓库](https://github.com/IceKyrin/sovits_guide)
  """)
- speaker_id = gr.Dropdown(label="音色", choices=list(character_dict.keys()))
  record_input = gr.Audio(source="microphone", label="录制你的声音", type="filepath", elem_id="audio_inputs")
  upload_input = gr.Audio(source="upload", label="上传音频(长度小于45秒)", type="filepath",
  elem_id="audio_inputs")
- vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
  vc_submit = gr.Button("转换", variant="primary")
  out_message = gr.Textbox(label="Output Message")
  out_audio = gr.Audio(label="Output Audio")
- vc_submit.click(vc_fn, [character_dict[speaker_id], record_input, upload_input, vc_transform], [out_message, out_audio])

  app.launch()

+ import time

  import gradio as gr
+ import soundfile
  import torch

  import infer_tool

  convert_cnt = [0]
+ dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model_name = "152_epochs.pth"
+ config_name = "nyarumul.json"
+ net_g_ms, hubert_soft, feature_input, hps_ms = infer_tool.load_model(f"{model_name}", f"configs/{config_name}")

+ # 获取config参数
+ target_sample = hps_ms.data.sampling_rate
+ spk_dict = {
+ "奕兰秋": 4
+ }


  def vc_fn(sid, audio_record, audio_upload, tran):
+ print(sid, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
  if audio_upload is not None:
  audio_path = audio_upload
  elif audio_record is not None:
  audio_path = audio_record
  else:
+ return "你需要上传wav文件或使用网页内置的录音!", None
+
+ audio, sampling_rate = infer_tool.format_wav(audio_path, target_sample)
  duration = audio.shape[0] / sampling_rate
+ if duration > 60:
+ return "请上传小于60s的音频,需要转换长音频请使用colab", None
+
+ o_audio, out_sr = infer_tool.infer(audio_path, spk_dict[sid], tran, net_g_ms, hubert_soft, feature_input)
+ out_path = f"./out_temp.wav"
+ soundfile.write(out_path, o_audio, target_sample)
+ infer_tool.f0_plt(audio_path, out_path, tran, hubert_soft, feature_input)
+ mistake, var = infer_tool.calc_error(audio_path, out_path, tran, feature_input)
+ return f"分段误差参考:0.3优秀,0.5左右合理,少量0.8-1可以接受\n若偏差过大,请调整升降半音数;多次调整均过大、说明超出歌手音域\n半音偏差:{mistake}\n半音方差:{var}", (
+ target_sample, o_audio), gr.Image.update("temp.jpg")


  app = gr.Blocks()

  with gr.Tabs():
  with gr.TabItem("Basic"):
  gr.Markdown(value="""
+ 本音源有授权,二创不创死主播即可。[其他音色体验](https://huggingface.co/spaces/innnky/nyaru-svc2.0-advanced)
+
+ 本模型为sovits_f0,支持**60s以内**的**无伴奏**wav、mp3格式,或使用**网页内置**的录音(二选一)
+
+ **error就用格式工厂自行转换为wav再上传**
+
+ 转换效果取决于源音频语气、节奏是否与目标音色相近。

+ 源音频为女声时,**建议降3-6key**,**最后的输出误差越接近0,音准越高**

+ 源音频为**低音男声**时,**建议升3key,具体看曲线图情况**

+ f0曲线可以直观的显示跑调情况,蓝色为输入音高,橙色为合成音频的音高

+ 若**只看见橙色**,说明蓝色曲线被覆盖,转换效果较好

  """)
+ speaker_id = gr.Dropdown(label="音色", choices=["奕兰秋"], value="奕兰秋")
  record_input = gr.Audio(source="microphone", label="录制你的声音", type="filepath", elem_id="audio_inputs")
  upload_input = gr.Audio(source="upload", label="上传音频(长度小于45秒)", type="filepath",
  elem_id="audio_inputs")
+ vc_transform = gr.Number(label="升降半音(整数,可以正负,半音数量,升高八度就是12)", value=0)
  vc_submit = gr.Button("转换", variant="primary")
  out_message = gr.Textbox(label="Output Message")
  out_audio = gr.Audio(label="Output Audio")
+ f0_image = gr.Image(label="f0曲线")
+ vc_submit.click(vc_fn, [speaker_id, record_input, upload_input, vc_transform],
+ [out_message, out_audio, f0_image])
+ with gr.TabItem("使用说明"):
+ gr.Markdown(value="""
+ 0、合集:https://github.com/IceKyrin/sovits_guide/blob/main/README.md
+
+ 1、仅支持sovit_f0(sovits2.0)模型
+
+ 2、自行下载hubert-soft-0d54a1f4.pt改名为hubert.pt(已经下好了)
+ https://github.com/bshall/hubert/releases/tag/v0.1
+
+ 3、pth文件夹下放置sovits2.0的模型
+
+ 4、与模型配套的xxx.json,需有speaker项——人物列表
+
+ 5、放无伴奏的音频、或网页内置录音,不要放奇奇怪怪的格式
+
+ 6、仅供交流使用,不对用户行为负责
+
+ 7、268000为44100预模型,配合sovits_pre.json;50000为22050预模型,配合nyarumul.json

+ """)
  app.launch()
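For orientation, the new app.py routes everything through infer_tool: load the checkpoint and config once, resample the upload, run inference, then plot and score the f0 error. A minimal standalone sketch of that same flow, reusing only the calls and file names visible in this diff (the input path and speaker id below are illustrative placeholders), could look like:

```python
# Hypothetical standalone sketch of the inference flow used by the new app.py.
# Checkpoint/config names and infer_tool signatures are taken from the diff above;
# the input path is a placeholder and may differ in an actual deployment.
import soundfile
import infer_tool

net_g_ms, hubert_soft, feature_input, hps_ms = infer_tool.load_model("152_epochs.pth", "configs/nyarumul.json")
target_sample = hps_ms.data.sampling_rate

audio_path = "input.wav"   # unaccompanied vocal, under 60 s (placeholder path)
speaker_id = 4             # "奕兰秋" in spk_dict above
tran = 0                   # transpose, in semitones

# Resample/format the input to the model's sampling rate.
audio, sampling_rate = infer_tool.format_wav(audio_path, target_sample)

# Run voice conversion and write the result to disk.
o_audio, out_sr = infer_tool.infer(audio_path, speaker_id, tran, net_g_ms, hubert_soft, feature_input)
soundfile.write("out_temp.wav", o_audio, target_sample)

# Plot the f0 curves and report the semitone error, as the web UI does.
infer_tool.f0_plt(audio_path, "out_temp.wav", tran, hubert_soft, feature_input)
mistake, var = infer_tool.calc_error(audio_path, "out_temp.wav", tran, feature_input)
print(f"semitone error: {mistake}, variance: {var}")
```

The vc_fn() callback above performs exactly these steps, plus the Gradio plumbing around them.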
attentions.py CHANGED
@@ -1,303 +1,311 @@
1
- import copy
2
  import math
3
- import numpy as np
4
  import torch
5
  from torch import nn
6
- from torch.nn import functional as F
7
 
8
  import commons
9
- import modules
10
  from modules import LayerNorm
11
-
12
 
13
  class Encoder(nn.Module):
14
- def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
15
- super().__init__()
16
- self.hidden_channels = hidden_channels
17
- self.filter_channels = filter_channels
18
- self.n_heads = n_heads
19
- self.n_layers = n_layers
20
- self.kernel_size = kernel_size
21
- self.p_dropout = p_dropout
22
- self.window_size = window_size
23
-
24
- self.drop = nn.Dropout(p_dropout)
25
- self.attn_layers = nn.ModuleList()
26
- self.norm_layers_1 = nn.ModuleList()
27
- self.ffn_layers = nn.ModuleList()
28
- self.norm_layers_2 = nn.ModuleList()
29
- for i in range(self.n_layers):
30
- self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
31
- self.norm_layers_1.append(LayerNorm(hidden_channels))
32
- self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
33
- self.norm_layers_2.append(LayerNorm(hidden_channels))
34
-
35
- def forward(self, x, x_mask):
36
- attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
37
- x = x * x_mask
38
- for i in range(self.n_layers):
39
- y = self.attn_layers[i](x, x, attn_mask)
40
- y = self.drop(y)
41
- x = self.norm_layers_1[i](x + y)
42
-
43
- y = self.ffn_layers[i](x, x_mask)
44
- y = self.drop(y)
45
- x = self.norm_layers_2[i](x + y)
46
- x = x * x_mask
47
- return x
 
 
49
 
50
  class Decoder(nn.Module):
51
- def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
52
- super().__init__()
53
- self.hidden_channels = hidden_channels
54
- self.filter_channels = filter_channels
55
- self.n_heads = n_heads
56
- self.n_layers = n_layers
57
- self.kernel_size = kernel_size
58
- self.p_dropout = p_dropout
59
- self.proximal_bias = proximal_bias
60
- self.proximal_init = proximal_init
61
-
62
- self.drop = nn.Dropout(p_dropout)
63
- self.self_attn_layers = nn.ModuleList()
64
- self.norm_layers_0 = nn.ModuleList()
65
- self.encdec_attn_layers = nn.ModuleList()
66
- self.norm_layers_1 = nn.ModuleList()
67
- self.ffn_layers = nn.ModuleList()
68
- self.norm_layers_2 = nn.ModuleList()
69
- for i in range(self.n_layers):
70
- self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
71
- self.norm_layers_0.append(LayerNorm(hidden_channels))
72
- self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
73
- self.norm_layers_1.append(LayerNorm(hidden_channels))
74
- self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
75
- self.norm_layers_2.append(LayerNorm(hidden_channels))
76
-
77
- def forward(self, x, x_mask, h, h_mask):
78
- """
79
- x: decoder input
80
- h: encoder output
81
- """
82
- self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
83
- encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
84
- x = x * x_mask
85
- for i in range(self.n_layers):
86
- y = self.self_attn_layers[i](x, x, self_attn_mask)
87
- y = self.drop(y)
88
- x = self.norm_layers_0[i](x + y)
89
-
90
- y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
91
- y = self.drop(y)
92
- x = self.norm_layers_1[i](x + y)
93
-
94
- y = self.ffn_layers[i](x, x_mask)
95
- y = self.drop(y)
96
- x = self.norm_layers_2[i](x + y)
97
- x = x * x_mask
98
- return x
 
 
100
 
101
  class MultiHeadAttention(nn.Module):
102
- def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
103
- super().__init__()
104
- assert channels % n_heads == 0
105
-
106
- self.channels = channels
107
- self.out_channels = out_channels
108
- self.n_heads = n_heads
109
- self.p_dropout = p_dropout
110
- self.window_size = window_size
111
- self.heads_share = heads_share
112
- self.block_length = block_length
113
- self.proximal_bias = proximal_bias
114
- self.proximal_init = proximal_init
115
- self.attn = None
116
-
117
- self.k_channels = channels // n_heads
118
- self.conv_q = nn.Conv1d(channels, channels, 1)
119
- self.conv_k = nn.Conv1d(channels, channels, 1)
120
- self.conv_v = nn.Conv1d(channels, channels, 1)
121
- self.conv_o = nn.Conv1d(channels, out_channels, 1)
122
- self.drop = nn.Dropout(p_dropout)
123
-
124
- if window_size is not None:
125
- n_heads_rel = 1 if heads_share else n_heads
126
- rel_stddev = self.k_channels**-0.5
127
- self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
128
- self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
129
-
130
- nn.init.xavier_uniform_(self.conv_q.weight)
131
- nn.init.xavier_uniform_(self.conv_k.weight)
132
- nn.init.xavier_uniform_(self.conv_v.weight)
133
- if proximal_init:
134
- with torch.no_grad():
135
- self.conv_k.weight.copy_(self.conv_q.weight)
136
- self.conv_k.bias.copy_(self.conv_q.bias)
137
-
138
- def forward(self, x, c, attn_mask=None):
139
- q = self.conv_q(x)
140
- k = self.conv_k(c)
141
- v = self.conv_v(c)
142
-
143
- x, self.attn = self.attention(q, k, v, mask=attn_mask)
144
-
145
- x = self.conv_o(x)
146
- return x
147
-
148
- def attention(self, query, key, value, mask=None):
149
- # reshape [b, d, t] -> [b, n_h, t, d_k]
150
- b, d, t_s, t_t = (*key.size(), query.size(2))
151
- query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
152
- key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
153
- value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
154
-
155
- scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
156
- if self.window_size is not None:
157
- assert t_s == t_t, "Relative attention is only available for self-attention."
158
- key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
159
- rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings)
160
- scores_local = self._relative_position_to_absolute_position(rel_logits)
161
- scores = scores + scores_local
162
- if self.proximal_bias:
163
- assert t_s == t_t, "Proximal bias is only available for self-attention."
164
- scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
165
- if mask is not None:
166
- scores = scores.masked_fill(mask == 0, -1e4)
167
- if self.block_length is not None:
168
- assert t_s == t_t, "Local attention is only available for self-attention."
169
- block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
170
- scores = scores.masked_fill(block_mask == 0, -1e4)
171
- p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
172
- p_attn = self.drop(p_attn)
173
- output = torch.matmul(p_attn, value)
174
- if self.window_size is not None:
175
- relative_weights = self._absolute_position_to_relative_position(p_attn)
176
- value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
177
- output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
178
- output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
179
- return output, p_attn
180
-
181
- def _matmul_with_relative_values(self, x, y):
182
- """
183
- x: [b, h, l, m]
184
- y: [h or 1, m, d]
185
- ret: [b, h, l, d]
186
- """
187
- ret = torch.matmul(x, y.unsqueeze(0))
188
- return ret
189
-
190
- def _matmul_with_relative_keys(self, x, y):
191
- """
192
- x: [b, h, l, d]
193
- y: [h or 1, m, d]
194
- ret: [b, h, l, m]
195
- """
196
- ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
197
- return ret
198
-
199
- def _get_relative_embeddings(self, relative_embeddings, length):
200
- max_relative_position = 2 * self.window_size + 1
201
- # Pad first before slice to avoid using cond ops.
202
- pad_length = max(length - (self.window_size + 1), 0)
203
- slice_start_position = max((self.window_size + 1) - length, 0)
204
- slice_end_position = slice_start_position + 2 * length - 1
205
- if pad_length > 0:
206
- padded_relative_embeddings = F.pad(
207
- relative_embeddings,
208
- commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
209
- else:
210
- padded_relative_embeddings = relative_embeddings
211
- used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
212
- return used_relative_embeddings
213
-
214
- def _relative_position_to_absolute_position(self, x):
215
- """
216
- x: [b, h, l, 2*l-1]
217
- ret: [b, h, l, l]
218
- """
219
- batch, heads, length, _ = x.size()
220
- # Concat columns of pad to shift from relative to absolute indexing.
221
- x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
222
-
223
- # Concat extra elements so to add up to shape (len+1, 2*len-1).
224
- x_flat = x.view([batch, heads, length * 2 * length])
225
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]]))
226
-
227
- # Reshape and slice out the padded elements.
228
- x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
229
- return x_final
230
-
231
- def _absolute_position_to_relative_position(self, x):
232
- """
233
- x: [b, h, l, l]
234
- ret: [b, h, l, 2*l-1]
235
- """
236
- batch, heads, length, _ = x.size()
237
- # padd along column
238
- x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
239
- x_flat = x.view([batch, heads, length**2 + length*(length -1)])
240
- # add 0's in the beginning that will skew the elements after reshape
241
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
242
- x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
243
- return x_final
244
-
245
- def _attention_bias_proximal(self, length):
246
- """Bias for self-attention to encourage attention to close positions.
247
- Args:
248
- length: an integer scalar.
249
- Returns:
250
- a Tensor with shape [1, 1, length, length]
251
- """
252
- r = torch.arange(length, dtype=torch.float32)
253
- diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
254
- return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
 
255
 
256
 
257
  class FFN(nn.Module):
258
- def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
259
- super().__init__()
260
- self.in_channels = in_channels
261
- self.out_channels = out_channels
262
- self.filter_channels = filter_channels
263
- self.kernel_size = kernel_size
264
- self.p_dropout = p_dropout
265
- self.activation = activation
266
- self.causal = causal
267
-
268
- if causal:
269
- self.padding = self._causal_padding
270
- else:
271
- self.padding = self._same_padding
272
-
273
- self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
274
- self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
275
- self.drop = nn.Dropout(p_dropout)
276
-
277
- def forward(self, x, x_mask):
278
- x = self.conv_1(self.padding(x * x_mask))
279
- if self.activation == "gelu":
280
- x = x * torch.sigmoid(1.702 * x)
281
- else:
282
- x = torch.relu(x)
283
- x = self.drop(x)
284
- x = self.conv_2(self.padding(x * x_mask))
285
- return x * x_mask
286
-
287
- def _causal_padding(self, x):
288
- if self.kernel_size == 1:
289
- return x
290
- pad_l = self.kernel_size - 1
291
- pad_r = 0
292
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
293
- x = F.pad(x, commons.convert_pad_shape(padding))
294
- return x
295
-
296
- def _same_padding(self, x):
297
- if self.kernel_size == 1:
298
- return x
299
- pad_l = (self.kernel_size - 1) // 2
300
- pad_r = self.kernel_size // 2
301
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
302
- x = F.pad(x, commons.convert_pad_shape(padding))
303
- return x
 
 
 
1
  import math
2
+
3
  import torch
4
  from torch import nn
5
+ from torch.nn import functional as t_func
6
 
7
  import commons
 
8
  from modules import LayerNorm
9
+
10
 
11
  class Encoder(nn.Module):
12
+ def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4,
13
+ **kwargs):
14
+ super().__init__()
15
+ self.hidden_channels = hidden_channels
16
+ self.filter_channels = filter_channels
17
+ self.n_heads = n_heads
18
+ self.n_layers = n_layers
19
+ self.kernel_size = kernel_size
20
+ self.p_dropout = p_dropout
21
+ self.window_size = window_size
22
+
23
+ self.drop = nn.Dropout(p_dropout)
24
+ self.attn_layers = nn.ModuleList()
25
+ self.norm_layers_1 = nn.ModuleList()
26
+ self.ffn_layers = nn.ModuleList()
27
+ self.norm_layers_2 = nn.ModuleList()
28
+ for i in range(self.n_layers):
29
+ self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
30
+ window_size=window_size))
31
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
32
+ self.ffn_layers.append(
33
+ FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
34
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
35
+
36
+ def forward(self, x, x_mask):
37
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
38
+ x = x * x_mask
39
+ for i in range(self.n_layers):
40
+ y = self.attn_layers[i](x, x, attn_mask)
41
+ y = self.drop(y)
42
+ x = self.norm_layers_1[i](x + y)
43
+
44
+ y = self.ffn_layers[i](x, x_mask)
45
+ y = self.drop(y)
46
+ x = self.norm_layers_2[i](x + y)
47
+ x = x * x_mask
48
+ return x
49
 
50
 
51
  class Decoder(nn.Module):
52
+ def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.,
53
+ proximal_bias=False, proximal_init=True, **kwargs):
54
+ super().__init__()
55
+ self.hidden_channels = hidden_channels
56
+ self.filter_channels = filter_channels
57
+ self.n_heads = n_heads
58
+ self.n_layers = n_layers
59
+ self.kernel_size = kernel_size
60
+ self.p_dropout = p_dropout
61
+ self.proximal_bias = proximal_bias
62
+ self.proximal_init = proximal_init
63
+
64
+ self.drop = nn.Dropout(p_dropout)
65
+ self.self_attn_layers = nn.ModuleList()
66
+ self.norm_layers_0 = nn.ModuleList()
67
+ self.encdec_attn_layers = nn.ModuleList()
68
+ self.norm_layers_1 = nn.ModuleList()
69
+ self.ffn_layers = nn.ModuleList()
70
+ self.norm_layers_2 = nn.ModuleList()
71
+ for i in range(self.n_layers):
72
+ self.self_attn_layers.append(
73
+ MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
74
+ proximal_bias=proximal_bias, proximal_init=proximal_init))
75
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
76
+ self.encdec_attn_layers.append(
77
+ MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
78
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
79
+ self.ffn_layers.append(
80
+ FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
81
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
82
+
83
+ def forward(self, x, x_mask, h, h_mask):
84
+ """
85
+ x: decoder input
86
+ h: encoder output
87
+ """
88
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
89
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
90
+ x = x * x_mask
91
+ for i in range(self.n_layers):
92
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
93
+ y = self.drop(y)
94
+ x = self.norm_layers_0[i](x + y)
95
+
96
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
97
+ y = self.drop(y)
98
+ x = self.norm_layers_1[i](x + y)
99
+
100
+ y = self.ffn_layers[i](x, x_mask)
101
+ y = self.drop(y)
102
+ x = self.norm_layers_2[i](x + y)
103
+ x = x * x_mask
104
+ return x
105
 
106
 
107
  class MultiHeadAttention(nn.Module):
108
+ def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True,
109
+ block_length=None, proximal_bias=False, proximal_init=False):
110
+ super().__init__()
111
+ assert channels % n_heads == 0
112
+
113
+ self.channels = channels
114
+ self.out_channels = out_channels
115
+ self.n_heads = n_heads
116
+ self.p_dropout = p_dropout
117
+ self.window_size = window_size
118
+ self.heads_share = heads_share
119
+ self.block_length = block_length
120
+ self.proximal_bias = proximal_bias
121
+ self.proximal_init = proximal_init
122
+ self.attn = None
123
+
124
+ self.k_channels = channels // n_heads
125
+ self.conv_q = nn.Conv1d(channels, channels, 1)
126
+ self.conv_k = nn.Conv1d(channels, channels, 1)
127
+ self.conv_v = nn.Conv1d(channels, channels, 1)
128
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
129
+ self.drop = nn.Dropout(p_dropout)
130
+
131
+ if window_size is not None:
132
+ n_heads_rel = 1 if heads_share else n_heads
133
+ rel_stddev = self.k_channels ** -0.5
134
+ self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
135
+ self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
136
+
137
+ nn.init.xavier_uniform_(self.conv_q.weight)
138
+ nn.init.xavier_uniform_(self.conv_k.weight)
139
+ nn.init.xavier_uniform_(self.conv_v.weight)
140
+ if proximal_init:
141
+ with torch.no_grad():
142
+ self.conv_k.weight.copy_(self.conv_q.weight)
143
+ self.conv_k.bias.copy_(self.conv_q.bias)
144
+
145
+ def forward(self, x, c, attn_mask=None):
146
+ q = self.conv_q(x)
147
+ k = self.conv_k(c)
148
+ v = self.conv_v(c)
149
+
150
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
151
+
152
+ x = self.conv_o(x)
153
+ return x
154
+
155
+ def attention(self, query, key, value, mask=None):
156
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
157
+ b, d, t_s, t_t = (*key.size(), query.size(2))
158
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
159
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
160
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
161
+
162
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
163
+ if self.window_size is not None:
164
+ assert t_s == t_t, "Relative attention is only available for self-attention."
165
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
166
+ rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
167
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
168
+ scores = scores + scores_local
169
+ if self.proximal_bias:
170
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
171
+ scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
172
+ if mask is not None:
173
+ scores = scores.masked_fill(mask == 0, -1e4)
174
+ if self.block_length is not None:
175
+ assert t_s == t_t, "Local attention is only available for self-attention."
176
+ block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
177
+ scores = scores.masked_fill(block_mask == 0, -1e4)
178
+ p_attn = t_func.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
179
+ p_attn = self.drop(p_attn)
180
+ output = torch.matmul(p_attn, value)
181
+ if self.window_size is not None:
182
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
183
+ value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
184
+ output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
185
+ output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
186
+ return output, p_attn
187
+
188
+ def _matmul_with_relative_values(self, x, y):
189
+ """
190
+ x: [b, h, l, m]
191
+ y: [h or 1, m, d]
192
+ ret: [b, h, l, d]
193
+ """
194
+ ret = torch.matmul(x, y.unsqueeze(0))
195
+ return ret
196
+
197
+ def _matmul_with_relative_keys(self, x, y):
198
+ """
199
+ x: [b, h, l, d]
200
+ y: [h or 1, m, d]
201
+ ret: [b, h, l, m]
202
+ """
203
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
204
+ return ret
205
+
206
+ def _get_relative_embeddings(self, relative_embeddings, length):
207
+ max_relative_position = 2 * self.window_size + 1
208
+ # Pad first before slice to avoid using cond ops.
209
+ pad_length = max(length - (self.window_size + 1), 0)
210
+ slice_start_position = max((self.window_size + 1) - length, 0)
211
+ slice_end_position = slice_start_position + 2 * length - 1
212
+ if pad_length > 0:
213
+ padded_relative_embeddings = t_func.pad(
214
+ relative_embeddings,
215
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
216
+ else:
217
+ padded_relative_embeddings = relative_embeddings
218
+ used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
219
+ return used_relative_embeddings
220
+
221
+ def _relative_position_to_absolute_position(self, x):
222
+ """
223
+ x: [b, h, l, 2*l-1]
224
+ ret: [b, h, l, l]
225
+ """
226
+ batch, heads, length, _ = x.size()
227
+ # Concat columns of pad to shift from relative to absolute indexing.
228
+ x = t_func.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
229
+
230
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
231
+ x_flat = x.view([batch, heads, length * 2 * length])
232
+ x_flat = t_func.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
233
+
234
+ # Reshape and slice out the padded elements.
235
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
236
+ return x_final
237
+
238
+ def _absolute_position_to_relative_position(self, x):
239
+ """
240
+ x: [b, h, l, l]
241
+ ret: [b, h, l, 2*l-1]
242
+ """
243
+ batch, heads, length, _ = x.size()
244
+ # padd along column
245
+ x = t_func.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
246
+ x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
247
+ # add 0's in the beginning that will skew the elements after reshape
248
+ x_flat = t_func.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
249
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
250
+ return x_final
251
+
252
+ def _attention_bias_proximal(self, length):
253
+ """Bias for self-attention to encourage attention to close positions.
254
+ Args:
255
+ length: an integer scalar.
256
+ Returns:
257
+ a Tensor with shape [1, 1, length, length]
258
+ """
259
+ r = torch.arange(length, dtype=torch.float32)
260
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
261
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
262
 
263
 
264
  class FFN(nn.Module):
265
+ def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None,
266
+ causal=False):
267
+ super().__init__()
268
+ self.in_channels = in_channels
269
+ self.out_channels = out_channels
270
+ self.filter_channels = filter_channels
271
+ self.kernel_size = kernel_size
272
+ self.p_dropout = p_dropout
273
+ self.activation = activation
274
+ self.causal = causal
275
+
276
+ if causal:
277
+ self.padding = self._causal_padding
278
+ else:
279
+ self.padding = self._same_padding
280
+
281
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
282
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
283
+ self.drop = nn.Dropout(p_dropout)
284
+
285
+ def forward(self, x, x_mask):
286
+ x = self.conv_1(self.padding(x * x_mask))
287
+ if self.activation == "gelu":
288
+ x = x * torch.sigmoid(1.702 * x)
289
+ else:
290
+ x = torch.relu(x)
291
+ x = self.drop(x)
292
+ x = self.conv_2(self.padding(x * x_mask))
293
+ return x * x_mask
294
+
295
+ def _causal_padding(self, x):
296
+ if self.kernel_size == 1:
297
+ return x
298
+ pad_l = self.kernel_size - 1
299
+ pad_r = 0
300
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
301
+ x = t_func.pad(x, commons.convert_pad_shape(padding))
302
+ return x
303
+
304
+ def _same_padding(self, x):
305
+ if self.kernel_size == 1:
306
+ return x
307
+ pad_l = (self.kernel_size - 1) // 2
308
+ pad_r = self.kernel_size // 2
309
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
310
+ x = t_func.pad(x, commons.convert_pad_shape(padding))
311
+ return x
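The attentions.py change is a formatting pass (line wrapping plus renaming the functional import from F to t_func); the Encoder/Decoder/MultiHeadAttention logic is unchanged. A small smoke test of the windowed relative-attention Encoder, with illustrative dimensions borrowed from config.json (hidden 256, 2 heads, 6 layers), might be:

```python
# Smoke test for the relative-position Encoder defined in attentions.py.
# Shapes follow its forward() signature: x is [batch, hidden_channels, time],
# x_mask is [batch, 1, time]. The dimensions are illustration values only.
import torch
import commons
from attentions import Encoder

enc = Encoder(hidden_channels=256, filter_channels=768, n_heads=2,
              n_layers=6, kernel_size=3, p_dropout=0.1, window_size=4)
enc.eval()

x = torch.randn(2, 256, 100)                 # two sequences, 100 frames each
lengths = torch.tensor([100, 80])            # the second sequence is padded
x_mask = commons.sequence_mask(lengths, 100).unsqueeze(1).float()  # [2, 1, 100]

with torch.no_grad():
    y = enc(x, x_mask)
print(y.shape)  # torch.Size([2, 256, 100]); padded frames stay zeroed by the mask
```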
commons.py CHANGED
@@ -1,161 +1,160 @@
1
  import math
2
- import numpy as np
3
  import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
 
7
 
8
  def init_weights(m, mean=0.0, std=0.01):
9
- classname = m.__class__.__name__
10
- if classname.find("Conv") != -1:
11
- m.weight.data.normal_(mean, std)
12
 
13
 
14
  def get_padding(kernel_size, dilation=1):
15
- return int((kernel_size*dilation - dilation)/2)
16
 
17
 
18
  def convert_pad_shape(pad_shape):
19
- l = pad_shape[::-1]
20
- pad_shape = [item for sublist in l for item in sublist]
21
- return pad_shape
22
 
23
 
24
  def intersperse(lst, item):
25
- result = [item] * (len(lst) * 2 + 1)
26
- result[1::2] = lst
27
- return result
28
 
29
 
30
  def kl_divergence(m_p, logs_p, m_q, logs_q):
31
- """KL(P||Q)"""
32
- kl = (logs_q - logs_p) - 0.5
33
- kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
34
- return kl
35
 
36
 
37
  def rand_gumbel(shape):
38
- """Sample from the Gumbel distribution, protect from overflows."""
39
- uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
40
- return -torch.log(-torch.log(uniform_samples))
41
 
42
 
43
  def rand_gumbel_like(x):
44
- g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
45
- return g
46
 
47
 
48
  def slice_segments(x, ids_str, segment_size=4):
49
- ret = torch.zeros_like(x[:, :, :segment_size])
50
- for i in range(x.size(0)):
51
- idx_str = ids_str[i]
52
- idx_end = idx_str + segment_size
53
- ret[i] = x[i, :, idx_str:idx_end]
54
- return ret
55
 
56
 
57
  def rand_slice_segments(x, x_lengths=None, segment_size=4):
58
- b, d, t = x.size()
59
- if x_lengths is None:
60
- x_lengths = t
61
- ids_str_max = x_lengths - segment_size + 1
62
- ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
63
- ret = slice_segments(x, ids_str, segment_size)
64
- return ret, ids_str
65
 
66
 
67
  def get_timing_signal_1d(
68
- length, channels, min_timescale=1.0, max_timescale=1.0e4):
69
- position = torch.arange(length, dtype=torch.float)
70
- num_timescales = channels // 2
71
- log_timescale_increment = (
72
- math.log(float(max_timescale) / float(min_timescale)) /
73
- (num_timescales - 1))
74
- inv_timescales = min_timescale * torch.exp(
75
- torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
76
- scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
77
- signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
78
- signal = F.pad(signal, [0, 0, 0, channels % 2])
79
- signal = signal.view(1, channels, length)
80
- return signal
81
 
82
 
83
  def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
84
- b, channels, length = x.size()
85
- signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
86
- return x + signal.to(dtype=x.dtype, device=x.device)
87
 
88
 
89
  def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
90
- b, channels, length = x.size()
91
- signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
92
- return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
93
 
94
 
95
  def subsequent_mask(length):
96
- mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
97
- return mask
98
 
99
 
100
  @torch.jit.script
101
  def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
102
- n_channels_int = n_channels[0]
103
- in_act = input_a + input_b
104
- t_act = torch.tanh(in_act[:, :n_channels_int, :])
105
- s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
106
- acts = t_act * s_act
107
- return acts
108
 
109
 
110
  def convert_pad_shape(pad_shape):
111
- l = pad_shape[::-1]
112
- pad_shape = [item for sublist in l for item in sublist]
113
- return pad_shape
114
 
115
 
116
  def shift_1d(x):
117
- x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
118
- return x
119
 
120
 
121
  def sequence_mask(length, max_length=None):
122
- if max_length is None:
123
- max_length = length.max()
124
- x = torch.arange(max_length, dtype=length.dtype, device=length.device)
125
- return x.unsqueeze(0) < length.unsqueeze(1)
126
 
127
 
128
  def generate_path(duration, mask):
129
- """
130
- duration: [b, 1, t_x]
131
- mask: [b, 1, t_y, t_x]
132
- """
133
- device = duration.device
134
-
135
- b, _, t_y, t_x = mask.shape
136
- cum_duration = torch.cumsum(duration, -1)
137
-
138
- cum_duration_flat = cum_duration.view(b * t_x)
139
- path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
140
- path = path.view(b, t_x, t_y)
141
- path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
142
- path = path.unsqueeze(1).transpose(2,3) * mask
143
- return path
144
 
145
 
146
  def clip_grad_value_(parameters, clip_value, norm_type=2):
147
- if isinstance(parameters, torch.Tensor):
148
- parameters = [parameters]
149
- parameters = list(filter(lambda p: p.grad is not None, parameters))
150
- norm_type = float(norm_type)
151
- if clip_value is not None:
152
- clip_value = float(clip_value)
153
-
154
- total_norm = 0
155
- for p in parameters:
156
- param_norm = p.grad.data.norm(norm_type)
157
- total_norm += param_norm.item() ** norm_type
158
  if clip_value is not None:
159
- p.grad.data.clamp_(min=-clip_value, max=clip_value)
160
- total_norm = total_norm ** (1. / norm_type)
161
- return total_norm
1
  import math
2
+
3
  import torch
4
+ from torch.nn import functional as t_func
 
5
 
6
 
7
  def init_weights(m, mean=0.0, std=0.01):
8
+ classname = m.__class__.__name__
9
+ if classname.find("Conv") != -1:
10
+ m.weight.data.normal_(mean, std)
11
 
12
 
13
  def get_padding(kernel_size, dilation=1):
14
+ return int((kernel_size * dilation - dilation) / 2)
15
 
16
 
17
  def convert_pad_shape(pad_shape):
18
+ l = pad_shape[::-1]
19
+ pad_shape = [item for sublist in l for item in sublist]
20
+ return pad_shape
21
 
22
 
23
  def intersperse(lst, item):
24
+ result = [item] * (len(lst) * 2 + 1)
25
+ result[1::2] = lst
26
+ return result
27
 
28
 
29
  def kl_divergence(m_p, logs_p, m_q, logs_q):
30
+ """KL(P||Q)"""
31
+ kl = (logs_q - logs_p) - 0.5
32
+ kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2. * logs_q)
33
+ return kl
34
 
35
 
36
  def rand_gumbel(shape):
37
+ """Sample from the Gumbel distribution, protect from overflows."""
38
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
39
+ return -torch.log(-torch.log(uniform_samples))
40
 
41
 
42
  def rand_gumbel_like(x):
43
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
44
+ return g
45
 
46
 
47
  def slice_segments(x, ids_str, segment_size=4):
48
+ ret = torch.zeros_like(x[:, :, :segment_size])
49
+ for i in range(x.size(0)):
50
+ idx_str = ids_str[i]
51
+ idx_end = idx_str + segment_size
52
+ ret[i] = x[i, :, idx_str:idx_end]
53
+ return ret
54
 
55
 
56
  def rand_slice_segments(x, x_lengths=None, segment_size=4):
57
+ b, d, t = x.size()
58
+ if x_lengths is None:
59
+ x_lengths = t
60
+ ids_str_max = x_lengths - segment_size + 1
61
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
62
+ ret = slice_segments(x, ids_str, segment_size)
63
+ return ret, ids_str
64
 
65
 
66
  def get_timing_signal_1d(
67
+ length, channels, min_timescale=1.0, max_timescale=1.0e4):
68
+ position = torch.arange(length, dtype=torch.float)
69
+ num_timescales = channels // 2
70
+ log_timescale_increment = (
71
+ math.log(float(max_timescale) / float(min_timescale)) /
72
+ (num_timescales - 1))
73
+ inv_timescales = min_timescale * torch.exp(
74
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
75
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
76
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
77
+ signal = t_func.pad(signal, [0, 0, 0, channels % 2])
78
+ signal = signal.view(1, channels, length)
79
+ return signal
80
 
81
 
82
  def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
83
+ b, channels, length = x.size()
84
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
85
+ return x + signal.to(dtype=x.dtype, device=x.device)
86
 
87
 
88
  def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
89
+ b, channels, length = x.size()
90
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
91
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
92
 
93
 
94
  def subsequent_mask(length):
95
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
96
+ return mask
97
 
98
 
99
  @torch.jit.script
100
  def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
101
+ n_channels_int = n_channels[0]
102
+ in_act = input_a + input_b
103
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
104
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
105
+ acts = t_act * s_act
106
+ return acts
107
 
108
 
109
  def convert_pad_shape(pad_shape):
110
+ l = pad_shape[::-1]
111
+ pad_shape = [item for sublist in l for item in sublist]
112
+ return pad_shape
113
 
114
 
115
  def shift_1d(x):
116
+ x = t_func.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
117
+ return x
118
 
119
 
120
  def sequence_mask(length, max_length=None):
121
+ if max_length is None:
122
+ max_length = length.max()
123
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
124
+ return x.unsqueeze(0) < length.unsqueeze(1)
125
 
126
 
127
  def generate_path(duration, mask):
128
+ """
129
+ duration: [b, 1, t_x]
130
+ mask: [b, 1, t_y, t_x]
131
+ """
132
+ device = duration.device
133
+
134
+ b, _, t_y, t_x = mask.shape
135
+ cum_duration = torch.cumsum(duration, -1)
136
+
137
+ cum_duration_flat = cum_duration.view(b * t_x)
138
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
139
+ path = path.view(b, t_x, t_y)
140
+ path = path - t_func.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
141
+ path = path.unsqueeze(1).transpose(2, 3) * mask
142
+ return path
143
 
144
 
145
  def clip_grad_value_(parameters, clip_value, norm_type=2):
146
+ if isinstance(parameters, torch.Tensor):
147
+ parameters = [parameters]
148
+ parameters = list(filter(lambda para: para.grad is not None, parameters))
149
+ norm_type = float(norm_type)
150
  if clip_value is not None:
151
+ clip_value = float(clip_value)
152
+
153
+ total_norm = 0
154
+ for p in parameters:
155
+ param_norm = p.grad.data.norm(norm_type)
156
+ total_norm += param_norm.item() ** norm_type
157
+ if clip_value is not None:
158
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
159
+ total_norm = total_norm ** (1. / norm_type)
160
+ return total_norm
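commons.py gets the same cosmetic treatment (unused imports dropped, F renamed to t_func); the helpers themselves behave as before. Two of them appear throughout the codebase, and a short illustration on toy tensors may help:

```python
# Illustration of two helpers from commons.py: sequence_mask() builds a boolean
# padding mask from per-example lengths, and convert_pad_shape() flattens a nested
# pad spec into the argument order torch.nn.functional.pad expects.
import torch
from torch.nn import functional as t_func
import commons

lengths = torch.tensor([4, 2])
mask = commons.sequence_mask(lengths, max_length=5)
print(mask)
# tensor([[ True,  True,  True,  True, False],
#         [ True,  True, False, False, False]])

# Pad the last dimension of a [b, c, t] tensor by one frame on the left, as shift_1d() does.
x = torch.arange(6.).view(1, 2, 3)
pad = commons.convert_pad_shape([[0, 0], [0, 0], [1, 0]])  # -> [1, 0, 0, 0, 0, 0]
print(t_func.pad(x, pad)[:, :, :-1])
```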
configs/yilanqiu.json → config.json RENAMED
@@ -1,7 +1,7 @@
  {
  "train": {
  "log_interval": 200,
- "eval_interval": 2000,
  "seed": 1234,
  "epochs": 10000,
  "learning_rate": 2e-4,
@@ -10,7 +10,7 @@
  0.99
  ],
  "eps": 1e-9,
- "batch_size": 16,
  "fp16_run": true,
  "lr_decay": 0.999875,
  "segment_size": 8192,
@@ -20,8 +20,8 @@
  "c_kl": 1.0
  },
  "data": {
- "training_files": "/root/content/qiu/train.txt",
- "validation_files": "/root/content/qiu/val.txt",
  "text_cleaners": [
  "english_cleaners2"
  ],
@@ -34,10 +34,10 @@
  "mel_fmin": 0.0,
  "mel_fmax": null,
  "add_blank": true,
- "n_speakers": 3,
- "cleaned_text": true
  },
  "model": {
  "inter_channels": 192,
  "hidden_channels": 256,
  "filter_channels": 768,
@@ -86,8 +86,13 @@
  "gin_channels": 256
  },
  "speakers": [
- "maolei",
- "opencpop",
- "yilanqiu"
  ]
- }

  {
  "train": {
  "log_interval": 200,
+ "eval_interval": 5000,
  "seed": 1234,
  "epochs": 10000,
  "learning_rate": 2e-4,

  0.99
  ],
  "eps": 1e-9,
+ "batch_size": 32,
  "fp16_run": true,
  "lr_decay": 0.999875,
  "segment_size": 8192,

  "c_kl": 1.0
  },
  "data": {
+ "training_files": "./filelist/train.txt",
+ "validation_files": "./filelist/val.txt",
  "text_cleaners": [
  "english_cleaners2"
  ],

  "mel_fmin": 0.0,
  "mel_fmax": null,
  "add_blank": true,
+ "n_speakers": 8
  },
  "model": {
+ "sampling_rate": 22050,
  "inter_channels": 192,
  "hidden_channels": 256,
  "filter_channels": 768,

  "gin_channels": 256
  },
  "speakers": [
+ "zhezhi",
+ "kuangsan",
+ "sisinai",
+ "qinli",
+ "xixian",
+ "yejushi",
+ "meijiu",
+ "shixiang"
  ]
+ }
configs/nyarumul.json DELETED
@@ -1,53 +0,0 @@
- {
- "train": {
- "log_interval": 200,
- "eval_interval": 2000,
- "seed": 1234,
- "epochs": 10000,
- "learning_rate": 2e-4,
- "betas": [0.8, 0.99],
- "eps": 1e-9,
- "batch_size": 16,
- "fp16_run": true,
- "lr_decay": 0.999875,
- "segment_size": 8192,
- "init_lr_ratio": 1,
- "warmup_epochs": 0,
- "c_mel": 45,
- "c_kl": 1.0
- },
- "data": {
- "training_files":"/content/drive/MyDrive/SingingVC/trainmul.txt",
- "validation_files":"/content/drive/MyDrive/SingingVC/valmul.txt",
- "text_cleaners":["english_cleaners2"],
- "max_wav_value": 32768.0,
- "sampling_rate": 22050,
- "filter_length": 1024,
- "hop_length": 256,
- "win_length": 1024,
- "n_mel_channels": 80,
- "mel_fmin": 0.0,
- "mel_fmax": null,
- "add_blank": true,
- "n_speakers": 3,
- "cleaned_text": true
- },
- "model": {
- "inter_channels": 192,
- "hidden_channels": 256,
- "filter_channels": 768,
- "n_heads": 2,
- "n_layers": 6,
- "kernel_size": 3,
- "p_dropout": 0.1,
- "resblock": "1",
- "resblock_kernel_sizes": [3,7,11],
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
- "upsample_rates": [8,8,2,2],
- "upsample_initial_channel": 512,
- "upsample_kernel_sizes": [16,16,4,4],
- "n_layers_q": 3,
- "use_spectral_norm": false,
- "gin_channels": 256
- }
- }
configs/nyarusing.json DELETED
@@ -1,52 +0,0 @@
- {
- "train": {
- "log_interval": 200,
- "eval_interval": 2000,
- "seed": 1234,
- "epochs": 20000,
- "learning_rate": 2e-4,
- "betas": [0.8, 0.99],
- "eps": 1e-9,
- "batch_size": 24,
- "fp16_run": true,
- "lr_decay": 0.999875,
- "segment_size": 8192,
- "init_lr_ratio": 1,
- "warmup_epochs": 0,
- "c_mel": 45,
- "c_kl": 1.0
- },
- "data": {
- "training_files":"/content/train.txt",
- "validation_files":"/content/nyarusing/val.txt",
- "text_cleaners":["english_cleaners2"],
- "max_wav_value": 32768.0,
- "sampling_rate": 22050,
- "filter_length": 1024,
- "hop_length": 256,
- "win_length": 1024,
- "n_mel_channels": 80,
- "mel_fmin": 0.0,
- "mel_fmax": null,
- "add_blank": true,
- "n_speakers": 0,
- "cleaned_text": true
- },
- "model": {
- "inter_channels": 192,
- "hidden_channels": 256,
- "filter_channels": 768,
- "n_heads": 2,
- "n_layers": 6,
- "kernel_size": 3,
- "p_dropout": 0.1,
- "resblock": "1",
- "resblock_kernel_sizes": [3,7,11],
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
- "upsample_rates": [8,8,2,2],
- "upsample_initial_channel": 512,
- "upsample_kernel_sizes": [16,16,4,4],
- "n_layers_q": 3,
- "use_spectral_norm": false
- }
- }
data.py DELETED
@@ -1,36 +0,0 @@
- import os
- import numpy as np
- import icassp2022_vocal_transcription
-
-
- def resize2d(source, target_len):
- source = source.astype(float)
- source[source < 0.001] = np.nan
- target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
- source)
- res = np.nan_to_num(target)
- ret = res[:].astype(int)
- # 若调整大小时采样到中间的点,则以上一个点作为当前音高值
- for i in range(len(res)):
- if res[i] - ret[i] > 0.001:
- ret[i] = ret[i - 1]
- return ret
-
-
- def get_end_file(dir_path, end):
- file_lists = []
- for root, dirs, files in os.walk(dir_path):
- for f_file in files:
- if f_file.endswith(end):
- file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
-
- return file_lists
-
-
- folder = "val"
- wav_paths = get_end_file(f"./qiu/wavs/{folder}/", "wav")
- for wav_path in wav_paths:
- pitch = icassp2022_vocal_transcription.transcribe(wav_path)
- soft = np.load(wav_path.replace("wavs", "soft").replace(".wav", ".npy"))
- pitch = resize2d(pitch, len(soft[:, 0]))
- np.save(wav_path.replace("wavs", "pitch").replace(".wav", ".npy"), pitch)
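The deleted data.py was an offline preprocessing script: it transcribed each wav to a pitch track and stretched it to the length of the matching soft-unit (HuBERT) feature matrix with resize2d(). A self-contained rerun of that function on made-up numbers shows the behaviour being removed here:

```python
# Behaviour of the removed resize2d() helper from data.py above: it stretches a
# frame-level pitch track to a new length with np.interp, treats near-zero
# (unvoiced) frames as gaps, and snaps interpolated midpoints back to the previous
# pitch value. The input values below are toy numbers for illustration only.
import numpy as np

def resize2d(source, target_len):
    source = source.astype(float)
    source[source < 0.001] = np.nan
    target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len,
                       np.arange(0, len(source)), source)
    res = np.nan_to_num(target)
    ret = res[:].astype(int)
    # If resizing lands between two frames, keep the previous frame's pitch value.
    for i in range(len(res)):
        if res[i] - ret[i] > 0.001:
            ret[i] = ret[i - 1]
    return ret

pitch = np.array([0, 0, 60, 60, 62, 62, 0, 0])   # MIDI-style pitch, 0 = unvoiced
print(resize2d(pitch, 12))                        # same contour resampled to 12 frames
```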
data_utils.py CHANGED
@@ -1,14 +1,12 @@
1
- import time
2
  import os
3
  import random
 
4
  import numpy as np
5
  import torch
6
  import torch.utils.data
7
- import numpy as np
8
- import commons
9
  from mel_processing import spectrogram_torch
 
10
  from utils import load_wav_to_torch, load_filepaths_and_text
11
- from text import text_to_sequence, cleaned_text_to_sequence
12
 
13
 
14
  def dropout1d(myarray, ratio=0.5):
@@ -59,11 +57,11 @@ class TextAudioLoader(torch.utils.data.Dataset):
59
 
60
  def get_audio_text_pair(self, audiopath_and_text):
61
  # separate filename and text
62
- audiopath, text, pitch = audiopath_and_text[0], audiopath_and_text[1],audiopath_and_text[2]
63
  text = self.get_text(text)
64
  spec, wav = self.get_audio(audiopath)
65
  pitch = self.get_pitch(pitch)
66
- return (text, spec, wav, pitch)
67
 
68
  def get_pitch(self, pitch):
69
 
@@ -99,7 +97,7 @@ class TextAudioLoader(torch.utils.data.Dataset):
99
  return len(self.audiopaths_and_text)
100
 
101
 
102
- class TextAudioCollate():
103
  """ Zero-pads model inputs and targets
104
  """
105
 
@@ -123,7 +121,6 @@ class TextAudioCollate():
123
  max_pitch_len = max([x[3].shape[0] for x in batch])
124
  # print(batch)
125
 
126
-
127
  text_lengths = torch.LongTensor(len(batch))
128
  spec_lengths = torch.LongTensor(len(batch))
129
  wav_lengths = torch.LongTensor(len(batch))
@@ -205,13 +202,14 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
205
 
206
  def get_audio_text_speaker_pair(self, audiopath_sid_text):
207
  # separate filename, speaker_id and text
208
- audiopath, sid, text, pitch = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2], audiopath_sid_text[3]
 
209
  text = self.get_text(text)
210
  spec, wav = self.get_audio(audiopath)
211
  sid = self.get_sid(sid)
212
  pitch = self.get_pitch(pitch)
213
 
214
- return (text, spec, wav, pitch, sid)
215
 
216
  def get_audio(self, filename):
217
  audio, sampling_rate = load_wav_to_torch(filename)
@@ -235,7 +233,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
235
  soft = np.load(text)
236
  text_norm = torch.FloatTensor(soft)
237
  return text_norm
238
-
239
  def get_pitch(self, pitch):
240
  return torch.LongTensor(np.load(pitch))
241
 
@@ -250,7 +248,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
250
  return len(self.audiopaths_sid_text)
251
 
252
 
253
- class TextAudioSpeakerCollate():
254
  """ Zero-pads model inputs and targets
255
  """
256
 
@@ -310,7 +308,7 @@ class TextAudioSpeakerCollate():
310
 
311
  if self.return_ids:
312
  return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid, ids_sorted_decreasing
313
- return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths,pitch_padded , sid
314
 
315
 
316
  class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
@@ -400,7 +398,7 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
400
 
401
  if hi > lo:
402
  mid = (hi + lo) // 2
403
- if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
404
  return mid
405
  elif x <= self.boundaries[mid]:
406
  return self._bisect(x, lo, mid)
 
 
1
  import os
2
  import random
3
+
4
  import numpy as np
5
  import torch
6
  import torch.utils.data
 
 
7
  from mel_processing import spectrogram_torch
8
+
9
  from utils import load_wav_to_torch, load_filepaths_and_text
 
10
 
11
 
12
  def dropout1d(myarray, ratio=0.5):
 
57
 
58
  def get_audio_text_pair(self, audiopath_and_text):
59
  # separate filename and text
60
+ audiopath, text, pitch = audiopath_and_text[0], audiopath_and_text[1], audiopath_and_text[2]
61
  text = self.get_text(text)
62
  spec, wav = self.get_audio(audiopath)
63
  pitch = self.get_pitch(pitch)
64
+ return text, spec, wav, pitch
65
 
66
  def get_pitch(self, pitch):
67
 
 
97
  return len(self.audiopaths_and_text)
98
 
99
 
100
+ class TextAudioCollate:
101
  """ Zero-pads model inputs and targets
102
  """
103
 
 
121
  max_pitch_len = max([x[3].shape[0] for x in batch])
122
  # print(batch)
123
 
 
124
  text_lengths = torch.LongTensor(len(batch))
125
  spec_lengths = torch.LongTensor(len(batch))
126
  wav_lengths = torch.LongTensor(len(batch))
 
202
 
203
  def get_audio_text_speaker_pair(self, audiopath_sid_text):
204
  # separate filename, speaker_id and text
205
+ audiopath, sid, text, pitch = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2], \
206
+ audiopath_sid_text[3]
207
  text = self.get_text(text)
208
  spec, wav = self.get_audio(audiopath)
209
  sid = self.get_sid(sid)
210
  pitch = self.get_pitch(pitch)
211
 
212
+ return text, spec, wav, pitch, sid
213
 
214
  def get_audio(self, filename):
215
  audio, sampling_rate = load_wav_to_torch(filename)
 
233
  soft = np.load(text)
234
  text_norm = torch.FloatTensor(soft)
235
  return text_norm
236
+
237
  def get_pitch(self, pitch):
238
  return torch.LongTensor(np.load(pitch))
239
 
 
248
  return len(self.audiopaths_sid_text)
249
 
250
 
251
+ class TextAudioSpeakerCollate:
252
  """ Zero-pads model inputs and targets
253
  """
254
 
 
308
 
309
  if self.return_ids:
310
  return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid, ids_sorted_decreasing
311
+ return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid
312
 
313
 
314
  class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
 
398
 
399
  if hi > lo:
400
  mid = (hi + lo) // 2
401
+ if self.boundaries[mid] < x <= self.boundaries[mid + 1]:
402
  return mid
403
  elif x <= self.boundaries[mid]:
404
  return self._bisect(x, lo, mid)
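The only behavioural line in the _bisect hunk above is the half-open bucket test boundaries[mid] < x <= boundaries[mid + 1]; the chained comparison is equivalent to the old `and` form. A small standalone sketch of how the bucket sampler assigns a spectrogram length to a bucket (the boundary values here are made up):

```
def bisect(x, boundaries, lo=0, hi=None):
    hi = len(boundaries) - 1 if hi is None else hi
    if hi > lo:
        mid = (hi + lo) // 2
        if boundaries[mid] < x <= boundaries[mid + 1]:
            return mid
        elif x <= boundaries[mid]:
            return bisect(x, boundaries, lo, mid)
        return bisect(x, boundaries, mid + 1, hi)
    return -1   # outside every bucket; such samples are dropped by the sampler

boundaries = [32, 300, 400, 500, 600, 700, 800, 900, 1000]  # hypothetical frame-length boundaries
print(bisect(450, boundaries))   # -> 2, i.e. the (400, 500] bucket
```
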
hubert/__init__.py DELETED
@@ -1,8 +0,0 @@
1
- from .model import (
2
- Hubert,
3
- HubertDiscrete,
4
- HubertSoft,
5
- hubert_discrete,
6
- hubert_soft,
7
- kmeans100,
8
- )
 
 
 
 
 
 
 
 
 
hubert/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (281 Bytes)
 
hubert/__pycache__/model.cpython-38.pyc DELETED
Binary file (10 kB)
 
hubert/dataset.py DELETED
@@ -1,91 +0,0 @@
1
- import random
2
- from pathlib import Path
3
- import numpy as np
4
- import json
5
-
6
- import torch
7
- import torch.nn.functional as F
8
- from torch.utils.data import Dataset
9
- import torchaudio
10
-
11
-
12
- class AcousticUnitsDataset(Dataset):
13
- def __init__(
14
- self,
15
- root: Path,
16
- sample_rate: int = 16000,
17
- label_rate: int = 50,
18
- min_samples: int = 32000,
19
- max_samples: int = 250000,
20
- train: bool = True,
21
- ):
22
- self.wavs_dir = root / "wavs"
23
- self.units_dir = root / "units"
24
-
25
- with open(root / "lengths.json") as file:
26
- self.lenghts = json.load(file)
27
-
28
- pattern = "train-*/**/*.flac" if train else "dev-*/**/*.flac"
29
- metadata = (
30
- (path, path.relative_to(self.wavs_dir).with_suffix("").as_posix())
31
- for path in self.wavs_dir.rglob(pattern)
32
- )
33
- metadata = ((path, key) for path, key in metadata if key in self.lenghts)
34
- self.metadata = [
35
- path for path, key in metadata if self.lenghts[key] > min_samples
36
- ]
37
-
38
- self.sample_rate = sample_rate
39
- self.label_rate = label_rate
40
- self.min_samples = min_samples
41
- self.max_samples = max_samples
42
- self.train = train
43
-
44
- def __len__(self):
45
- return len(self.metadata)
46
-
47
- def __getitem__(self, index):
48
- wav_path = self.metadata[index]
49
- units_path = self.units_dir / wav_path.relative_to(self.wavs_dir)
50
-
51
- wav, _ = torchaudio.load(wav_path)
52
- wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
53
- codes = np.load(units_path.with_suffix(".npy"))
54
-
55
- return wav, torch.from_numpy(codes).long()
56
-
57
- def collate(self, batch):
58
- wavs, codes = zip(*batch)
59
- wavs, codes = list(wavs), list(codes)
60
-
61
- wav_lengths = [wav.size(-1) for wav in wavs]
62
- code_lengths = [code.size(-1) for code in codes]
63
-
64
- wav_frames = min(self.max_samples, *wav_lengths)
65
-
66
- collated_wavs, wav_offsets = [], []
67
- for wav in wavs:
68
- wav_diff = wav.size(-1) - wav_frames
69
- wav_offset = random.randint(0, wav_diff)
70
- wav = wav[:, wav_offset : wav_offset + wav_frames]
71
-
72
- collated_wavs.append(wav)
73
- wav_offsets.append(wav_offset)
74
-
75
- rate = self.label_rate / self.sample_rate
76
- code_offsets = [round(wav_offset * rate) for wav_offset in wav_offsets]
77
- code_frames = round(wav_frames * rate)
78
- remaining_code_frames = [
79
- length - offset for length, offset in zip(code_lengths, code_offsets)
80
- ]
81
- code_frames = min(code_frames, *remaining_code_frames)
82
-
83
- collated_codes = []
84
- for code, code_offset in zip(codes, code_offsets):
85
- code = code[code_offset : code_offset + code_frames]
86
- collated_codes.append(code)
87
-
88
- wavs = torch.stack(collated_wavs, dim=0)
89
- codes = torch.stack(collated_codes, dim=0)
90
-
91
- return wavs, codes
 
hubert/utils.py DELETED
@@ -1,58 +0,0 @@
1
- import torch
2
-
3
-
4
- class Metric:
5
- def __init__(self):
6
- self.steps = 0
7
- self.value = 0
8
-
9
- def update(self, value):
10
- self.steps += 1
11
- self.value += (value - self.value) / self.steps
12
- return self.value
13
-
14
- def reset(self):
15
- self.steps = 0
16
- self.value = 0
17
-
18
-
19
- def save_checkpoint(
20
- checkpoint_dir,
21
- hubert,
22
- optimizer,
23
- scaler,
24
- step,
25
- loss,
26
- best,
27
- logger,
28
- ):
29
- state = {
30
- "hubert": hubert.state_dict(),
31
- "optimizer": optimizer.state_dict(),
32
- "scaler": scaler.state_dict(),
33
- "step": step,
34
- "loss": loss,
35
- }
36
- checkpoint_dir.mkdir(exist_ok=True, parents=True)
37
- checkpoint_path = checkpoint_dir / f"model-{step}.pt"
38
- torch.save(state, checkpoint_path)
39
- if best:
40
- best_path = checkpoint_dir / "model-best.pt"
41
- torch.save(state, best_path)
42
- logger.info(f"Saved checkpoint: {checkpoint_path.stem}")
43
-
44
-
45
- def load_checkpoint(
46
- load_path,
47
- hubert,
48
- optimizer,
49
- scaler,
50
- rank,
51
- logger,
52
- ):
53
- logger.info(f"Loading checkpoint from {load_path}")
54
- checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"})
55
- hubert.load_state_dict(checkpoint["hubert"])
56
- scaler.load_state_dict(checkpoint["scaler"])
57
- optimizer.load_state_dict(checkpoint["optimizer"])
58
- return checkpoint["step"], checkpoint["loss"]
 
hubert/model.py → hubert_model.py RENAMED
@@ -1,20 +1,12 @@
1
  import copy
2
- from typing import Optional, Tuple
3
  import random
4
-
5
- from sklearn.cluster import KMeans
6
 
7
  import torch
8
  import torch.nn as nn
9
- import torch.nn.functional as F
10
  from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
11
 
12
- URLS = {
13
- "hubert-discrete": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-discrete-e9416457.pt",
14
- "hubert-soft": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt",
15
- "kmeans100": "https://github.com/bshall/hubert/releases/download/v0.1/kmeans100-50f36a95.pt",
16
- }
17
-
18
 
19
  class Hubert(nn.Module):
20
  def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
@@ -44,7 +36,7 @@ class Hubert(nn.Module):
44
  return x, mask
45
 
46
  def encode(
47
- self, x: torch.Tensor, layer: Optional[int] = None
48
  ) -> Tuple[torch.Tensor, torch.Tensor]:
49
  x = self.feature_extractor(x)
50
  x = self.feature_projection(x.transpose(1, 2))
@@ -75,24 +67,11 @@ class HubertSoft(Hubert):
75
 
76
  @torch.inference_mode()
77
  def units(self, wav: torch.Tensor) -> torch.Tensor:
78
- wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
79
  x, _ = self.encode(wav)
80
  return self.proj(x)
81
 
82
 
83
- class HubertDiscrete(Hubert):
84
- def __init__(self, kmeans):
85
- super().__init__(504)
86
- self.kmeans = kmeans
87
-
88
- @torch.inference_mode()
89
- def units(self, wav: torch.Tensor) -> torch.LongTensor:
90
- wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
91
- x, _ = self.encode(wav, layer=7)
92
- x = self.kmeans.predict(x.squeeze().cpu().numpy())
93
- return torch.tensor(x, dtype=torch.long, device=wav.device)
94
-
95
-
96
  class FeatureExtractor(nn.Module):
97
  def __init__(self):
98
  super().__init__()
@@ -106,13 +85,13 @@ class FeatureExtractor(nn.Module):
106
  self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
107
 
108
  def forward(self, x: torch.Tensor) -> torch.Tensor:
109
- x = F.gelu(self.norm0(self.conv0(x)))
110
- x = F.gelu(self.conv1(x))
111
- x = F.gelu(self.conv2(x))
112
- x = F.gelu(self.conv3(x))
113
- x = F.gelu(self.conv4(x))
114
- x = F.gelu(self.conv5(x))
115
- x = F.gelu(self.conv6(x))
116
  return x
117
 
118
 
@@ -144,13 +123,13 @@ class PositionalConvEmbedding(nn.Module):
144
 
145
  def forward(self, x: torch.Tensor) -> torch.Tensor:
146
  x = self.conv(x.transpose(1, 2))
147
- x = F.gelu(x[:, :, :-1])
148
  return x.transpose(1, 2)
149
 
150
 
151
  class TransformerEncoder(nn.Module):
152
  def __init__(
153
- self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
154
  ) -> None:
155
  super(TransformerEncoder, self).__init__()
156
  self.layers = nn.ModuleList(
@@ -159,11 +138,11 @@ class TransformerEncoder(nn.Module):
159
  self.num_layers = num_layers
160
 
161
  def forward(
162
- self,
163
- src: torch.Tensor,
164
- mask: torch.Tensor = None,
165
- src_key_padding_mask: torch.Tensor = None,
166
- output_layer: Optional[int] = None,
167
  ) -> torch.Tensor:
168
  output = src
169
  for layer in self.layers[:output_layer]:
@@ -174,11 +153,11 @@ class TransformerEncoder(nn.Module):
174
 
175
 
176
  def _compute_mask(
177
- shape: Tuple[int, int],
178
- mask_prob: float,
179
- mask_length: int,
180
- device: torch.device,
181
- min_masks: int = 0,
182
  ) -> torch.Tensor:
183
  batch_size, sequence_length = shape
184
 
@@ -228,62 +207,17 @@ def _compute_mask(
228
  return mask
229
 
230
 
231
- def hubert_discrete(
232
- pretrained: bool = True,
233
- progress: bool = True,
234
- ) -> HubertDiscrete:
235
- r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
236
- Args:
237
- pretrained (bool): load pretrained weights into the model
238
- progress (bool): show progress bar when downloading model
239
- """
240
- kmeans = kmeans100(pretrained=pretrained, progress=progress)
241
- hubert = HubertDiscrete(kmeans)
242
- if pretrained:
243
- checkpoint = torch.hub.load_state_dict_from_url(
244
- URLS["hubert-discrete"], progress=progress
245
- )
246
- consume_prefix_in_state_dict_if_present(checkpoint, "module.")
247
- hubert.load_state_dict(checkpoint)
248
- hubert.eval()
249
- return hubert
250
-
251
-
252
  def hubert_soft(
253
- path: str
254
  ) -> HubertSoft:
255
  r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
256
  Args:
257
  path (str): path of a pretrained model
258
  """
259
- dev = torch.device("cpu" if torch.cuda.is_available() else "cpu")
260
  hubert = HubertSoft()
261
  checkpoint = torch.load(path)
262
  consume_prefix_in_state_dict_if_present(checkpoint, "module.")
263
  hubert.load_state_dict(checkpoint)
264
  hubert.eval().to(dev)
265
  return hubert
266
-
267
-
268
- def _kmeans(
269
- num_clusters: int, pretrained: bool = True, progress: bool = True
270
- ) -> KMeans:
271
- kmeans = KMeans(num_clusters)
272
- if pretrained:
273
- checkpoint = torch.hub.load_state_dict_from_url(
274
- URLS[f"kmeans{num_clusters}"], progress=progress
275
- )
276
- kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
277
- kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"]
278
- kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy()
279
- return kmeans
280
-
281
-
282
- def kmeans100(pretrained: bool = True, progress: bool = True) -> KMeans:
283
- r"""
284
- k-means checkpoint for HuBERT-Discrete with 100 clusters.
285
- Args:
286
- pretrained (bool): load pretrained weights into the model
287
- progress (bool): show progress bar when downloading model
288
- """
289
- return _kmeans(100, pretrained, progress)
 
1
  import copy
 
2
  import random
3
+ from typing import Optional, Tuple
 
4
 
5
  import torch
6
  import torch.nn as nn
7
+ import torch.nn.functional as t_func
8
  from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
9
 
 
 
 
 
 
 
10
 
11
  class Hubert(nn.Module):
12
  def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
 
36
  return x, mask
37
 
38
  def encode(
39
+ self, x: torch.Tensor, layer: Optional[int] = None
40
  ) -> Tuple[torch.Tensor, torch.Tensor]:
41
  x = self.feature_extractor(x)
42
  x = self.feature_projection(x.transpose(1, 2))
 
67
 
68
  @torch.inference_mode()
69
  def units(self, wav: torch.Tensor) -> torch.Tensor:
70
+ wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
71
  x, _ = self.encode(wav)
72
  return self.proj(x)
73
 
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  class FeatureExtractor(nn.Module):
76
  def __init__(self):
77
  super().__init__()
 
85
  self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
86
 
87
  def forward(self, x: torch.Tensor) -> torch.Tensor:
88
+ x = t_func.gelu(self.norm0(self.conv0(x)))
89
+ x = t_func.gelu(self.conv1(x))
90
+ x = t_func.gelu(self.conv2(x))
91
+ x = t_func.gelu(self.conv3(x))
92
+ x = t_func.gelu(self.conv4(x))
93
+ x = t_func.gelu(self.conv5(x))
94
+ x = t_func.gelu(self.conv6(x))
95
  return x
96
 
97
 
 
123
 
124
  def forward(self, x: torch.Tensor) -> torch.Tensor:
125
  x = self.conv(x.transpose(1, 2))
126
+ x = t_func.gelu(x[:, :, :-1])
127
  return x.transpose(1, 2)
128
 
129
 
130
  class TransformerEncoder(nn.Module):
131
  def __init__(
132
+ self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
133
  ) -> None:
134
  super(TransformerEncoder, self).__init__()
135
  self.layers = nn.ModuleList(
 
138
  self.num_layers = num_layers
139
 
140
  def forward(
141
+ self,
142
+ src: torch.Tensor,
143
+ mask: torch.Tensor = None,
144
+ src_key_padding_mask: torch.Tensor = None,
145
+ output_layer: Optional[int] = None,
146
  ) -> torch.Tensor:
147
  output = src
148
  for layer in self.layers[:output_layer]:
 
153
 
154
 
155
  def _compute_mask(
156
+ shape: Tuple[int, int],
157
+ mask_prob: float,
158
+ mask_length: int,
159
+ device: torch.device,
160
+ min_masks: int = 0,
161
  ) -> torch.Tensor:
162
  batch_size, sequence_length = shape
163
 
 
207
  return mask
208
 
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  def hubert_soft(
211
+ path: str
212
  ) -> HubertSoft:
213
  r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
214
  Args:
215
  path (str): path of a pretrained model
216
  """
217
+ dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
218
  hubert = HubertSoft()
219
  checkpoint = torch.load(path)
220
  consume_prefix_in_state_dict_if_present(checkpoint, "module.")
221
  hubert.load_state_dict(checkpoint)
222
  hubert.eval().to(dev)
223
  return hubert
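With the k-means/discrete path stripped out, the renamed hubert_model module reduces to one loader and one method. A hedged usage sketch; the checkpoint and audio file names below are placeholders:

```
import torch
import torchaudio

import hubert_model

hubert = hubert_model.hubert_soft("hubert-soft.pt")       # any local hubert-soft checkpoint

wav, sr = torchaudio.load("example.wav")                  # mono input assumed here
wav = torchaudio.functional.resample(wav, sr, 16000)      # the encoder expects 16 kHz
with torch.inference_mode():
    units = hubert.units(wav.unsqueeze(0))                # (batch, frames, unit_dim) soft units
print(units.shape)
```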
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
icassp2022_vocal_transcription/.gitignore DELETED
@@ -1,3 +0,0 @@
1
- output/
2
- audio/*
3
- !audio/test.wav
 
 
 
 
icassp2022_vocal_transcription/README.md DELETED
@@ -1,56 +0,0 @@
1
- # icassp2022-vocal-transcription
2
- Companion code for the paper:
3
- Sangeun Kum, Jongpil Lee, Keunhyoung Luke Kim, Taehyoung Kim, Juhan Nam *"Pseudo-Label Transfer from Frame-level to Note-level in a Teacher-student Framework for Singing Transcription from Polyphonic Music"*, ICASSP2022, Singapore <[link](https://ieeexplore.ieee.org/document/9747147)>
4
-
5
-
6
- ## Abstract
7
-
8
- Lack of large-scale note-level labeled data is the major obstacle to singing transcription from polyphonic music. We address the issue by using pseudo labels from vocal pitch estimation models given unlabeled data. The proposed method first converts the frame-level pseudo labels to note-level through pitch and rhythm quantization steps. Then, it further improves the label quality through self-training in a teacher-student framework.
9
-
10
- <img src="./img/ICASSP2022-fig1-2.png" width="70%">
11
-
12
- To validate the method, we conduct various experiment settings by investigating two vocal pitch estimation models as pseudo-label generators, two setups of teacher-student frameworks, and the number of iterations in self-training. The results show that the proposed method can effectively leverage large-scale unlabeled audio data and self-training with the noisy student model helps to improve performance. Finally, we show that the model trained with only unlabeled data has comparable performance to previous works and the model trained with additional labeled data achieves higher accuracy than the model trained with only labeled data.
13
-
14
- ## Demo video
15
- - <[Youtube Link 1](https://www.youtube.com/watch?v=wlD-GAGuj0M "Demo 1: Singing transcription from polyphonic music")> You&I (IU)
16
- - <[Youtube Link 2](https://youtu.be/iitOC4vuC8U "Demo 2: Singing transcription from polyphonic music")> You in my arms (Myung jin Moon)
17
-
18
-
19
- ## Dependencies
20
-
21
- - OS: LINUX
22
- - Programming language: Python 3.6+
23
- - Python Library
24
- - Keras 2.7.0 (Deep Learning library)
25
- - tensorflow 2.5.0 (Deep Learning library)
26
- - Librosa 0.8.1 (for STFT)
27
- - pydub 0.25.1 (for loading audio and resampling)
28
- - pretty-midi (for handling midi data)
29
- - Numpy, SciPy
30
-
31
- - Hardware
32
- - 1 GPU : GeForce GTX 3090
33
-
34
-
35
- ## Using STP from the command line
36
- ```
37
- $ python singing_transcription.py -i ../audio/test.wav -o ../output
38
-
39
- [optional arguments]
40
- -i path_audio Path to input audio file. (default: '../audio/pop1.wav')
41
- -o pathsave Path to folder for saving .mid file (default: '../output')
42
- -ot output_type (optional) Output type: midi or frame-level pitch score(fps) (default: 'midi')
43
- ```
44
- - output example: ADC04-pop1.wav
45
- <img src="./img/example_pop1_midi.png" width="100%">
46
- # Citation
47
- If you find our work useful, please consider citing our paper.
48
-
49
- ```
50
- @inproceedings{kum2022pseudo,
51
- title={Pseudo-Label Transfer from Frame-Level to Note-Level in a Teacher-Student Framework for Singing Transcription from Polyphonic Music},
52
- author={Sangeun Kum, Jongpil Lee, Keunhyoung Luke Kim, Taehyoung Kim, and Juhan Nam},
53
- booktitle={Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
54
- year={2022}
55
- }
56
- ```
 
icassp2022_vocal_transcription/__init__.py DELETED
@@ -1,3 +0,0 @@
1
- from .src import singing_transcription
2
-
3
- transcribe = singing_transcription.get_frame_level_output
 
 
 
 
icassp2022_vocal_transcription/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (254 Bytes)
 
icassp2022_vocal_transcription/data/weight_ST.hdf5 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ba38c046af48a359575c1a312d931966e56d94013ad56dd91f2de5219afa8a4
3
- size 17535208
 
 
 
 
icassp2022_vocal_transcription/data/x_train_mean.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f977a72104d19c3b92c764a4fe1335f411ffc331bb6f81ec2420016f07fa772c
3
- size 4232
 
 
 
 
icassp2022_vocal_transcription/data/x_train_std.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a120cbf8bc8e62544f7b0ce1185b0244f3c6971fd50b3092c66a0fda1f5405a
3
- size 4232
 
 
 
 
icassp2022_vocal_transcription/img/ICASSP2022-fig1-2.png DELETED
Binary file (26.9 kB)
 
icassp2022_vocal_transcription/img/example_pop1_midi.png DELETED
Binary file (136 kB)
 
icassp2022_vocal_transcription/requirements.txt DELETED
@@ -1,8 +0,0 @@
1
- keras==2.7.0
2
- numpy==1.19.5
3
- librosa==0.8.1
4
- mir-eval==0.6
5
- pretty-midi==0.2.9
6
- pydub==0.25.1
7
- scipy==1.7.3
8
- tensorflow==2.5.0
 
 
 
 
 
 
 
 
 
icassp2022_vocal_transcription/src/MIDI.py DELETED
@@ -1,141 +0,0 @@
1
- #%%
2
- import pretty_midi
3
- import numpy as np
4
- import librosa.display
5
-
6
-
7
- #%%
8
- def plot_piano_roll(pm, start_pitch, end_pitch, fs=100):
9
- """ Plot piano roll from .mid file
10
- ----------
11
- Parameters:
12
- pm: RWC, MDB, iKala, DSD100
13
- start/end_pitch: lowest/highest note (float)
14
- fs: sampling freq. (int)
15
-
16
- """
17
- # Use librosa's specshow function for displaying the piano roll
18
- librosa.display.specshow(
19
- pm.get_piano_roll(fs)[start_pitch:end_pitch],
20
- hop_length=1,
21
- sr=fs,
22
- x_axis="time",
23
- y_axis="cqt_note",
24
- fmin=pretty_midi.note_number_to_hz(start_pitch),
25
- )
26
-
27
-
28
- def midi_to_note(file_name, pitch_shift, fs=100, start_note=40, end_note=95):
29
- """ Convert .mid to note
30
- ----------
31
- Parameters:
32
- file_name: '.mid' (str)
33
- pitch_shift: shift the pitch to adjust notes correctly (int)
34
- fs: sampling freq. (int)
35
- start/end_pitch: lowest/highest note(int)
36
-
37
- ----------
38
- Returns:
39
- notes: note/10ms (array)
40
- """
41
-
42
- pm = pretty_midi.PrettyMIDI(file_name)
43
- frame_note = pm.get_piano_roll(fs)[start_note:end_note]
44
-
45
- length_audio = frame_note.shape[1]
46
- notes = np.zeros(length_audio)
47
-
48
- for i in range(length_audio):
49
- note_tmp = np.argmax(frame_note[:, i])
50
- if note_tmp > 0:
51
- notes[i] = (note_tmp + start_note) + pitch_shift
52
- # note[i] = 2 ** ((note_tmp -69) / 12.) * 440
53
- return notes
54
-
55
-
56
- def midi_to_segment(filename):
57
- """ Convert .mid to segment
58
- ----------
59
- Parameters:
60
- filename: .mid (str)
61
-
62
- ----------
63
- Returns:
64
- segments: [start(s),end(s),pitch] (list)
65
- """
66
-
67
- pm = pretty_midi.PrettyMIDI(filename)
68
- segment = []
69
- for note in pm.instruments[0].notes:
70
- segment.append([note.start, note.end, note.pitch])
71
- return segment
72
-
73
-
74
- def segment_to_midi(segments, path_output, tempo=120):
75
- """ Convert segment to .mid
76
- ----------
77
- Parameters:
78
- segments: [start(s),end(s),pitch] (list)
79
- path_output: path of save file (str)
80
- """
81
- pm = pretty_midi.PrettyMIDI(initial_tempo=int(tempo))
82
- inst_program = pretty_midi.instrument_name_to_program("Acoustic Grand Piano")
83
- inst = pretty_midi.Instrument(program=inst_program)
84
- for segment in segments:
85
- note = pretty_midi.Note(
86
- velocity=100, start=segment[0], end=segment[1], pitch=np.int(segment[2])
87
- )
88
- inst.notes.append(note)
89
- pm.instruments.append(inst)
90
- pm.write(f"{path_output}")
91
-
92
-
93
- def note_to_segment(note):
94
- """ Convert note to segment
95
- ----------
96
- Parameters:
97
- note: note/10ms (array)
98
- ----------
99
- Returns:
100
- segments: [start(s),end(s),pitch] (list)
101
- """
102
- startSeg = []
103
- endSeg = []
104
- notes = []
105
- flag = -1
106
-
107
- if note[0] > 0:
108
- startSeg.append(0)
109
- notes.append(np.int(note[0]))
110
- flag *= -1
111
- for i in range(0, len(note) - 1):
112
- if note[i] != note[i + 1]:
113
- if flag < 0:
114
- startSeg.append(0.01 * (i + 1))
115
- notes.append(np.int(note[i + 1]))
116
- flag *= -1
117
- else:
118
- if note[i + 1] == 0:
119
- endSeg.append(0.01 * i)
120
- flag *= -1
121
- else:
122
- endSeg.append(0.01 * i)
123
- startSeg.append(0.01 * (i + 1))
124
- notes.append(np.int(note[i + 1]))
125
-
126
- return list(zip(startSeg, endSeg, notes))
127
-
128
-
129
- def note2Midi(frame_level_pitchscroe, path_output, tempo):
130
- # note = np.loadtxt(path_input_note)
131
- # note = note[:, 1]
132
- segment = note_to_segment(frame_level_pitchscroe)
133
- segment_to_midi(segment, path_output=path_output, tempo=tempo)
134
-
135
-
136
- # def note2Midi(path_input_note, path_output, tempo):
137
- # note = np.loadtxt(path_input_note)
138
- # note = note[:, 1]
139
- # segment = note_to_segment(note)
140
- # segment_to_midi(segment, path_output=path_output, tempo=tempo)
141
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
icassp2022_vocal_transcription/src/__init__.py DELETED
File without changes
icassp2022_vocal_transcription/src/__pycache__/MIDI.cpython-38.pyc DELETED
Binary file (3.48 kB)
 
icassp2022_vocal_transcription/src/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (165 Bytes)
 
icassp2022_vocal_transcription/src/__pycache__/featureExtraction.cpython-38.pyc DELETED
Binary file (1.74 kB)
 
icassp2022_vocal_transcription/src/__pycache__/model.cpython-38.pyc DELETED
Binary file (3.1 kB)
 
icassp2022_vocal_transcription/src/__pycache__/quantization.cpython-38.pyc DELETED
Binary file (4.92 kB)
 
icassp2022_vocal_transcription/src/__pycache__/singing_transcription.cpython-38.pyc DELETED
Binary file (3.99 kB)
 
icassp2022_vocal_transcription/src/__pycache__/utils.cpython-38.pyc DELETED
Binary file (1.5 kB)
 
icassp2022_vocal_transcription/src/featureExtraction.py DELETED
@@ -1,61 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- import librosa
3
- from pydub import AudioSegment
4
- import pathlib
5
-
6
- # from pydub.playback import play
7
- import numpy as np
8
- import os
9
-
10
- PATH_PROJECT = os.path.dirname(os.path.realpath(__file__))
11
-
12
-
13
- def read_audio(filepath, sr=None):
14
- path = pathlib.Path(filepath)
15
- extenstion = path.suffix.replace(".", "")
16
- if extenstion == "mp3":
17
- sound = AudioSegment.from_mp3(filepath)
18
- else:
19
- sound = AudioSegment.from_file(filepath)
20
- # sound = sound[start * 1000 : end * 1000]
21
- sound = sound.set_channels(1)
22
- if sr == None:
23
- sr = sound.frame_rate
24
- sound = sound.set_frame_rate(sr)
25
- samples = sound.get_array_of_samples()
26
- y = np.array(samples).T.astype(np.float32)
27
-
28
- return y, sr
29
-
30
-
31
- def spec_extraction(file_name, win_size):
32
-
33
- y, _ = read_audio(file_name, sr=8000)
34
-
35
- S = librosa.core.stft(y, n_fft=1024, hop_length=80, win_length=1024)
36
- x_spec = np.abs(S)
37
- x_spec = librosa.core.power_to_db(x_spec, ref=np.max)
38
- x_spec = x_spec.astype(np.float32)
39
- num_frames = x_spec.shape[1]
40
-
41
- # for padding
42
- padNum = num_frames % win_size
43
- if padNum != 0:
44
- len_pad = win_size - padNum
45
- padding_feature = np.zeros(shape=(513, len_pad))
46
- x_spec = np.concatenate((x_spec, padding_feature), axis=1)
47
- num_frames = num_frames + len_pad
48
-
49
- x_test = []
50
- for j in range(0, num_frames, win_size):
51
- x_test_tmp = x_spec[:, range(j, j + win_size)].T
52
- x_test.append(x_test_tmp)
53
- x_test = np.array(x_test)
54
-
55
- # for standardization
56
- path_project = pathlib.Path(__file__).parent.parent
57
- x_train_mean = np.load(f"{path_project}/data/x_train_mean.npy")
58
- x_train_std = np.load(f"{path_project}/data/x_train_std.npy")
59
- x_test = (x_test - x_train_mean) / (x_train_std + 0.0001)
60
- x_test = x_test[:, :, :, np.newaxis]
61
- return x_test, x_spec
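spec_extraction above works at 8 kHz with n_fft=1024 and an 80-sample hop, so every frame carries 513 bins and frames arrive at 100 per second (a 10 ms hop, matching the transcription grid); the frame axis is then zero-padded up to a multiple of the 31-frame window. The shape arithmetic, as a quick check with an assumed frame count:

```
sr, n_fft, hop, win_size = 8000, 1024, 80, 31
n_bins = n_fft // 2 + 1                                # 513 frequency bins per frame
frames_per_second = sr / hop                           # 100.0, i.e. 10 ms per frame

num_frames = 320                                       # say a clip yields 320 STFT frames
pad = (win_size - num_frames % win_size) % win_size    # 320 % 31 = 10 -> 21 padding frames
chunks = (num_frames + pad) // win_size                # 11 windows of shape (31, 513)
print(n_bins, frames_per_second, pad, chunks)          # 513 100.0 21 11
```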
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
icassp2022_vocal_transcription/src/model.py DELETED
@@ -1,139 +0,0 @@
1
- # import keras.backend as KK
2
- import math
3
- from tensorflow.keras import backend as K
4
- from tensorflow.keras.regularizers import l2
5
- from tensorflow.keras.models import Model
6
- from tensorflow.keras.layers import (
7
- Conv2D,
8
- MaxPooling2D,
9
- BatchNormalization,
10
- LeakyReLU,
11
- Dropout,
12
- LSTM,
13
- Reshape,
14
- Bidirectional,
15
- TimeDistributed,
16
- Input,
17
- add,
18
- concatenate,
19
- Lambda,
20
- Dense,
21
- Activation,
22
- )
23
-
24
-
25
- # --------------------------------------------------------------------------------
26
- def ResNet_Block(input, block_id, filterNum):
27
- x = Conv2D(
28
- filterNum,
29
- (1, 1),
30
- name="conv_s" + str(block_id) + "_1x1",
31
- padding="same",
32
- kernel_initializer="he_normal",
33
- use_bias=False,
34
- )(input)
35
- shortcut = BatchNormalization()(x)
36
- x = LeakyReLU(0.01)(shortcut)
37
-
38
- x = Conv2D(
39
- filterNum,
40
- (3, 3),
41
- name="conv" + str(block_id) + "_1",
42
- padding="same",
43
- kernel_initializer="he_normal",
44
- use_bias=False,
45
- kernel_regularizer=l2(1e-5),
46
- )(x)
47
- x = BatchNormalization()(x)
48
- x = LeakyReLU(0.01)(x)
49
-
50
- # x = Dropout(0.3)(x)
51
-
52
- x = Conv2D(
53
- filterNum,
54
- (3, 3),
55
- name="conv" + str(block_id) + "_2",
56
- padding="same",
57
- kernel_initializer="he_normal",
58
- use_bias=False,
59
- kernel_regularizer=l2(1e-5),
60
- )(x)
61
- x = BatchNormalization()(x)
62
- x = LeakyReLU(0.01)(x)
63
-
64
- x = Conv2D(
65
- filterNum,
66
- (1, 1),
67
- name="conv_f" + str(block_id) + "_1x1",
68
- padding="same",
69
- kernel_initializer="he_normal",
70
- use_bias=False,
71
- )(x)
72
- x = BatchNormalization()(x)
73
-
74
- x = add([x, shortcut])
75
- x = LeakyReLU(0.01)(x)
76
- x = MaxPooling2D((1, 4))(x)
77
- return x
78
-
79
-
80
- def melody_ResNet_JDC(num_spec, window_size, note_res):
81
-
82
- num_output = int(55 * 2 ** (math.log(note_res, 2)) + 2)
83
- input = Input(shape=(window_size, num_spec, 1))
84
- block_1 = ResNet_Block(input=input, block_id=1, filterNum=64)
85
- block_2 = ResNet_Block(input=block_1, block_id=2, filterNum=128)
86
- block_3 = ResNet_Block(input=block_2, block_id=3, filterNum=192)
87
- block_4 = ResNet_Block(input=block_3, block_id=4, filterNum=256)
88
- block_4_dp = Dropout(0.3)(block_4)
89
-
90
- keras_shape = K.int_shape(block_4)
91
- numOutput_P = keras_shape[2] * keras_shape[3]
92
- output_tmp = Reshape((window_size, numOutput_P))(block_4_dp)
93
-
94
- # voicing
95
- block_1 = MaxPooling2D((1, 4 ** 3))(block_1)
96
- block_2 = MaxPooling2D((1, 4 ** 2))(block_2)
97
- block_3 = MaxPooling2D((1, 4 ** 1))(block_3)
98
- joint = concatenate([block_1, block_2, block_3, block_4])
99
- joint = Dropout(0.3)(joint)
100
- joint = Conv2D(
101
- 256,
102
- (1, 1),
103
- padding="same",
104
- kernel_initializer="he_normal",
105
- use_bias=False,
106
- kernel_regularizer=l2(1e-5),
107
- )(joint)
108
- joint = BatchNormalization()(joint)
109
- joint = LeakyReLU(0.01)(joint)
110
-
111
- keras_shape2 = K.int_shape(joint)
112
- num_V = keras_shape2[2] * keras_shape2[3]
113
-
114
- output_V_tmp = Reshape((window_size, num_V))(joint)
115
- output_V_tmp = Bidirectional(LSTM(32, return_sequences=True, stateful=False, dropout=0.2))(
116
- output_V_tmp
117
- )
118
- output_V = TimeDistributed(Dense(2))(output_V_tmp)
119
- output_V = TimeDistributed(Activation("softmax"), name="output_AUX_V")(output_V)
120
-
121
- # output
122
- output_tmp = Bidirectional(LSTM(256, return_sequences=True, dropout=0.2))(output_tmp)
123
- output_tmp = concatenate([output_tmp, output_V_tmp])
124
- output = TimeDistributed(Dense(num_output))(output_tmp)
125
- output = TimeDistributed(Activation("softmax"), name="output")(output)
126
-
127
- output_NS = Lambda(lambda x: x[:, :, 0])(output)
128
- output_NS = Reshape((window_size, 1))(output_NS)
129
-
130
- output_S = Lambda(lambda x: 1 - x[:, :, 0])(output)
131
- output_S = Reshape((window_size, 1))(output_S)
132
- output_PV = concatenate([output_NS, output_S])
133
-
134
- output_V_F = concatenate([output_V, output_PV])
135
- output_V_F = TimeDistributed(Dense(2))(output_V_F)
136
- output_V_F = TimeDistributed(Activation("softmax"), name="output_V")(output_V_F)
137
- model = Model(inputs=input, outputs=[output, output_V_F])
138
-
139
- return model
 
icassp2022_vocal_transcription/src/quantization.py DELETED
@@ -1,217 +0,0 @@
1
- # %%
2
- import numpy as np
3
- import librosa
4
- import librosa.display
5
-
6
- from scipy.signal import medfilt
7
- from matplotlib import pyplot as plt
8
- from .featureExtraction import read_audio
9
- from .utils import *
10
-
11
-
12
- # %%
13
- def calc_tempo(path_audio):
14
- """ Calculate audio tempo
15
- ----------
16
- Parameters:
17
- path_audio: str
18
-
19
- ----------
20
- Returns:
21
- tempo: float
22
-
23
- """
24
- target_sr = 22050
25
- y, _ = read_audio(path_audio, sr=target_sr)
26
- onset_strength = librosa.onset.onset_strength(y, sr=target_sr)
27
- tempo = librosa.beat.tempo(onset_envelope=onset_strength, sr=target_sr)
28
- return tempo
29
-
30
-
31
- def one_beat_frame_size(tempo):
32
- """ Calculate frame size of 1 beat
33
- ----------
34
- Parameters:
35
- tempo: float
36
-
37
- ----------
38
- Returns:
39
- tempo: int
40
-
41
- """
42
- return np.int(np.round(60 / tempo * 100))
43
-
44
-
45
- def median_filter_pitch(pitch, medfilt_size, weight):
46
- """ Smoothing pitch using median filter
47
- ----------
48
- Parameters:
49
- pitch: array
50
- medfilt_size: int
51
- weight: float
52
-
53
- ----------
54
- Returns:
55
- pitch: array
56
-
57
- """
58
-
59
- medfilt_size = np.int(medfilt_size * weight)
60
- if medfilt_size % 2 == 0:
61
- medfilt_size += 1
62
- return np.round(medfilt(pitch, medfilt_size))
63
-
64
-
65
- def clean_note_frames(note, min_note_len=5):
66
- """ Remove short pitch frames
67
- ----------
68
- Parameters:
69
- note: array
70
- min_note_len: int
71
-
72
- ----------
73
- Returns:
74
- output: array
75
-
76
- """
77
-
78
- prev_pitch = 0
79
- prev_pitch_start = 0
80
- output = np.copy(note)
81
- for i in range(len(note)):
82
- pitch = note[i]
83
- if pitch != prev_pitch:
84
- prev_pitch_duration = i - prev_pitch_start
85
- if prev_pitch_duration < min_note_len:
86
- output[prev_pitch_start:i] = [0] * prev_pitch_duration
87
- prev_pitch = pitch
88
- prev_pitch_start = i
89
- return output
90
-
91
-
92
- def makeSegments(note):
93
- """ Make segments of notes
94
- ----------
95
- Parameters:
96
- note: array
97
-
98
- ----------
99
- Returns:
100
- startSeg: starting points (array)
101
- endSeg: ending points (array)
102
-
103
- """
104
- startSeg = []
105
- endSeg = []
106
- flag = -1
107
- if note[0] > 0:
108
- startSeg.append(0)
109
- flag *= -1
110
- for i in range(0, len(note) - 1):
111
- if note[i] != note[i + 1]:
112
- if flag < 0:
113
- startSeg.append(i + 1)
114
- flag *= -1
115
- else:
116
- if note[i + 1] == 0:
117
- endSeg.append(i)
118
- flag *= -1
119
- else:
120
- endSeg.append(i)
121
- startSeg.append(i + 1)
122
- return startSeg, endSeg
123
-
124
-
125
- def remove_short_segment(idx, note_cleaned, start, end, minLength):
126
- """ Remove short segments
127
- ----------
128
- Parameters:
129
- idx: (int)
130
- note_cleaned: (array)
131
- start: starting points (array)
132
- end: ending points (array)
133
- minLength: (int)
134
-
135
- ----------
136
- Returns:
137
- note_cleaned: (array)
138
-
139
- """
140
-
141
- len_seg = end[idx] - start[idx]
142
- if len_seg < minLength:
143
- if (start[idx + 1] - end[idx] > minLength) and (start[idx] - end[idx - 1] > minLength):
144
- note_cleaned[start[idx] : end[idx] + 1] = [0] * (len_seg + 1)
145
- return note_cleaned
146
-
147
-
148
- def remove_octave_error(idx, note_cleaned, start, end):
149
- """ Remove octave error
150
- ----------
151
- Parameters:
152
- idx: (int)
153
- note_cleaned: (array)
154
- start: starting points (array)
155
- end: ending points (array)
156
-
157
- ----------
158
- Returns:
159
- note_cleaned: (array)
160
-
161
- """
162
- len_seg = end[idx] - start[idx]
163
- if (note_cleaned[start[idx - 1]] == note_cleaned[start[idx + 1]]) and (
164
- note_cleaned[start[idx]] != note_cleaned[start[idx + 1]]
165
- ):
166
- if np.abs(note_cleaned[start[idx]] - note_cleaned[start[idx + 1]]) % 12 == 0:
167
- note_cleaned[start[idx] - 1 : end[idx] + 1] = [note_cleaned[start[idx + 1]]] * (
168
- len_seg + 2
169
- )
170
- return note_cleaned
171
-
172
-
173
- def clean_segment(note, minLength):
174
- """ clean note segments
175
- ----------
176
- Parameters:
177
- note: (array)
178
- minLength: (int)
179
-
180
- ----------
181
- Returns:
182
- note_cleaned: (array)
183
-
184
- """
185
-
186
- note_cleaned = np.copy(note)
187
- start, end = makeSegments(note_cleaned)
188
-
189
- for i in range(1, len(start) - 1):
190
- note_cleaned = remove_short_segment(i, note_cleaned, start, end, minLength)
191
- note_cleaned = remove_octave_error(i, note_cleaned, start, end)
192
- return note_cleaned
193
-
194
-
195
- def refine_note(est_note, tempo):
196
- """ main: refine note segments
197
- ----------
198
- Parameters:
199
- est_note: (array)
200
- tempo: (float)
201
-
202
- ----------
203
- Returns:
204
- est_pitch_mf3_v: (array)
205
-
206
- """
207
- one_beat_size = one_beat_frame_size(tempo)
208
- est_note_mf1 = median_filter_pitch(est_note, one_beat_size, 1 / 8)
209
- est_note_mf2 = median_filter_pitch(est_note_mf1, one_beat_size, 1 / 4)
210
- est_note_mf3 = median_filter_pitch(est_note_mf2, one_beat_size, 1 / 3)
211
-
212
- vocing = est_note_mf1 > 0
213
- est_pitch_mf3_v = vocing * est_note_mf3
214
- est_pitch_mf3_v = clean_note_frames(est_pitch_mf3_v, int(one_beat_size * 1 / 8))
215
- est_pitch_mf3_v = clean_segment(est_pitch_mf3_v, int(one_beat_size * 1 / 4))
216
- return est_pitch_mf3_v
217
-
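All of the smoothing in the deleted quantization.py is expressed in beats: one_beat_frame_size(tempo) is the number of 10 ms frames per beat, and refine_note median-filters with kernels of 1/8, 1/4 and 1/3 of a beat (median_filter_pitch bumps even kernels up to the next odd size, as scipy's medfilt requires). A worked example of those kernel sizes for an assumed tempo:

```
import numpy as np

def one_beat_frame_size(tempo):
    return int(np.round(60 / tempo * 100))   # 60/tempo seconds per beat, 100 frames per second

tempo = 120.0                                # hypothetical librosa.beat.tempo estimate
beat = one_beat_frame_size(tempo)            # 50 frames = 0.5 s
for weight in (1 / 8, 1 / 4, 1 / 3):
    k = int(beat * weight)
    if k % 2 == 0:
        k += 1                               # medfilt needs an odd kernel size
    print(weight, k)                         # 0.125 -> 7, 0.25 -> 13, 0.333... -> 17
```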
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
icassp2022_vocal_transcription/src/singing_transcription.py DELETED
@@ -1,147 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # %%
3
- import argparse
4
- import numpy as np
5
- from pathlib import Path
6
- from .model import *
7
- from .featureExtraction import *
8
- from .quantization import *
9
- from .utils import *
10
- from .MIDI import *
11
-
12
- # %%
13
- class SingingTranscription:
14
- def __init__(self):
15
-
16
- self.PATH_PROJECT = pathlib.Path(__file__).absolute().parent.parent
17
- self.num_spec = 513
18
- self.window_size = 31
19
- self.note_res = 1
20
- self.batch_size = 64
21
-
22
- def load_model(self, path_weight, TF_summary=False):
23
-
24
- model = melody_ResNet_JDC(self.num_spec, self.window_size, self.note_res)
25
- model.load_weights(path_weight)
26
- if TF_summary == True:
27
- print(model.summary())
28
- return model
29
-
30
- def predict_melody(self, model_ST, filepath):
31
- pitch_range = np.arange(40, 95 + 1.0 / self.note_res, 1.0 / self.note_res)
32
- pitch_range = np.concatenate([np.zeros(1), pitch_range])
33
-
34
- """ Features extraction"""
35
- X_test, _ = spec_extraction(file_name=filepath, win_size=self.window_size)
36
-
37
- """ melody predict"""
38
- y_predict = model_ST.predict(X_test, batch_size=self.batch_size, verbose=1)
39
- y_predict = y_predict[0] # [0]: note, [1]: voicing
40
- y_shape = y_predict.shape
41
- num_total = y_shape[0] * y_shape[1]
42
- y_predict = np.reshape(y_predict, (num_total, y_shape[2]))
43
-
44
- est_MIDI = np.zeros(num_total)
45
- est_freq = np.zeros(num_total)
46
- for i in range(num_total):
47
- index_predict = np.argmax(y_predict[i])
48
- pitch_MIDI = pitch_range[np.int32(index_predict)]
49
- if pitch_MIDI >= 40 and pitch_MIDI <= 95:
50
- est_MIDI[i] = pitch_MIDI
51
- # est_freq[i] = 2 ** ((pitch_MIDI - 69) / 12.0) * 440
52
- return est_MIDI
53
-
54
- def save_output_frame_level(self, pitch_score, path_save, note_or_freq="note"):
55
- check_and_make_dir(Path(path_save))
56
- f = open(path_save, "w")
57
-
58
- assert (note_or_freq == "freq") or (note_or_freq == "note"), "please check 'note' or 'freq'"
59
- if note_or_freq == "freq":
60
- for j in range(len(pitch_score)):
61
- if pitch_score[j] > 0:
62
- pitch_score[j] = 2 ** ((pitch_score[j] - 69) / 12.0) * 440
63
- est = "%.2f %.4f\n" % (0.01 * j, pitch_score[j])
64
- f.write(est)
65
- elif note_or_freq == "note":
66
- for j in range(len(pitch_score)):
67
- est = "%.2f %.4f\n" % (0.01 * j, pitch_score[j])
68
- f.write(est)
69
-
70
- f.close()
71
-
72
-
73
- def main(args):
74
- ST = SingingTranscription()
75
-
76
- """ load model """
77
- model_ST = ST.load_model(f"{ST.PATH_PROJECT}/data/weight_ST.hdf5", TF_summary=False)
78
-
79
- """ predict note (time-freq) """
80
- path_audio = args.path_audio
81
- fl_note = ST.predict_melody(model_ST, path_audio) # frame-level pitch score
82
-
83
- """ post-processing """
84
- tempo = calc_tempo(path_audio)
85
- refined_fl_note = refine_note(fl_note, tempo) # frame-level pitch score
86
-
87
- """ convert frame-level pitch score to note-level (time-axis) """
88
- segment = note_to_segment(refined_fl_note) # note-level pitch score
89
-
90
- """ save ouput to .mid """
91
- filename = get_filename_wo_extension(path_audio)
92
- path_output = f"{args.path_save}/{filename}.mid"
93
- segment_to_midi(segment, path_output=path_output, tempo=tempo)
94
-
95
- if args.output_type == "fps":
96
- path_note = f"{args.path_save}/{filename}.txt"
97
- ST.save_output_frame_level(refined_fl_note, path_note, note_or_freq="freq")
98
-
99
- print(f"\n========= DONE =========")
100
- print(f"input: '{path_audio}'")
101
- print(f"output: '{path_output}'")
102
-
103
- ST = SingingTranscription()
104
-
105
- """ load model """
106
- model_ST = ST.load_model(f"{ST.PATH_PROJECT}/data/weight_ST.hdf5", TF_summary=False)
107
-
108
-
109
- def get_frame_level_output(wav_path):
110
- """ predict note (time-freq) """
111
- path_audio = wav_path
112
- fl_note = ST.predict_melody(model_ST, path_audio) # frame-level pitch score
113
-
114
- """ post-processing """
115
- tempo = calc_tempo(path_audio)
116
- refined_fl_note = refine_note(fl_note, tempo) # frame-level pitch score
117
- return refined_fl_note.astype(int)
118
-
119
-
120
- # %%
121
- if __name__ == "__main__":
122
- PATH_PROJECT = pathlib.Path(__file__).absolute().parent.parent
123
- parser = argparse.ArgumentParser(description="Predict singing transcription")
124
- parser.add_argument(
125
- "-i",
126
- "--path_audio",
127
- type=str,
128
- help="Path to input audio file.",
129
- default=f"{PATH_PROJECT}/audio/pop1.wav",
130
- )
131
- parser.add_argument(
132
- "-o",
133
- "--path_save",
134
- type=str,
135
- help="Path to folder for saving .mid file",
136
- default=f"{PATH_PROJECT}/output",
137
- )
138
-
139
- parser.add_argument(
140
- "-ot",
141
- "--output_type",
142
- type=str,
143
- help="(optional) Output type: midi or frame-level pitch score(fps)",
144
- default="midi",
145
- )
146
-
147
- main(parser.parse_args())
 
icassp2022_vocal_transcription/src/utils.py DELETED
@@ -1,49 +0,0 @@
1
- import os
2
- import numpy as np
3
- from pydub import AudioSegment
4
- import pathlib
5
-
6
-
7
- def check_and_make_dir(path_dir):
8
- if not os.path.exists(os.path.dirname(path_dir)):
9
- os.makedirs(os.path.dirname(path_dir))
10
-
11
-
12
- def get_filename_wo_extension(path_dir):
13
- return pathlib.Path(path_dir).stem
14
-
15
-
16
- def note2pitch(pitch):
17
- """ Convert MIDI number to freq.
18
- ----------
19
- Parameters:
20
- pitch: MIDI note numbers of pitch (array)
21
-
22
- ----------
23
- Returns:
24
- pitch: frequency of pitch (array)
25
- """
26
-
27
- pitch = np.array(pitch)
28
- pitch[pitch > 0] = 2 ** ((pitch[pitch > 0] - 69) / 12.0) * 440
29
- return pitch
30
-
31
-
32
- def pitch2note(pitch):
33
- """ Convert freq to MIDI number
34
- ----------
35
- Parameters:
36
- pitch: frequency of pitch (array)
37
-
38
- ----------
39
- Returns:
40
- pitch: MIDI note numbers of pitch (array)
41
- """
42
- pitch = np.array(pitch)
43
- pitch[pitch > 0] = np.round((69.0 + 12.0 * np.log2(pitch[pitch > 0] / 440.0)))
44
- return pitch
45
-
46
-
47
- a = np.array([0, 0, 0, 1, 2, 3, 5, 0, 0, 0, 1, 2, 4, 5])
48
- b = a[a > 0] * 2
49
- print(b)
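note2pitch and pitch2note above are the standard MIDI/Hz pair, f = 440 * 2^((m - 69)/12) and m = 69 + 12 * log2(f / 440), applied only to voiced (> 0) entries so rests stay at zero. Two spot checks:

```
import numpy as np

midi = np.array([0.0, 57, 69, 81])           # 0 marks an unvoiced frame
freq = midi.copy()
freq[freq > 0] = 2 ** ((freq[freq > 0] - 69) / 12.0) * 440
print(freq)                                  # [  0. 220. 440. 880.]

back = freq.copy()
back[back > 0] = np.round(69.0 + 12.0 * np.log2(back[back > 0] / 440.0))
print(back)                                  # [ 0. 57. 69. 81.]
```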
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer_tool.py CHANGED
@@ -1,100 +1,175 @@
 
1
  import os
 
2
 
 
3
  import numpy as np
4
- import soundfile
5
  import torch
6
  import torchaudio
7
- from pydub import AudioSegment
8
 
9
- dev = torch.device("cpu")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
- def get_units(path, hubert_soft):
13
- source, sr = torchaudio.load(path)
 
 
 
 
 
 
 
 
 
14
  source = torchaudio.functional.resample(source, sr, 16000)
 
 
15
  source = source.unsqueeze(0).to(dev)
16
  with torch.inference_mode():
17
  units = hubert_soft.units(source)
18
  return units
19
 
20
 
21
- def transcribe(path, length, transform, feature_input):
22
- feature_pit = feature_input.compute_f0(path)
23
  feature_pit = feature_pit * 2 ** (transform / 12)
24
  feature_pit = resize2d_f0(feature_pit, length)
25
  coarse_pit = feature_input.coarse_f0(feature_pit)
26
  return coarse_pit
27
 
28
 
29
- def resize2d_plus(source, target_len):
30
- source = source.astype(float)
31
- source[source < 0.001] = np.nan
32
- target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
33
- source)
34
- res = np.nan_to_num(target)
35
- ret = res[:].astype(int)
36
- # If resizing samples a point between two source frames, use the previous point as the current pitch value
37
- for i in range(len(res)):
38
- if res[i] - ret[i] > 0.001:
39
- ret[i] = ret[i - 1]
40
- return ret
 
 
 
 
 
 
41
 
 
 
 
42
 
43
- def infer(file_name, speaker_id, tran, target_sample, net_g_ms, hubert_soft, feature_input):
44
- source_path = "./wav_temp/input/" + file_name
45
- audio, sample_rate = torchaudio.load(source_path)
46
- input_size = audio.shape[-1]
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  sid = torch.LongTensor([int(speaker_id)]).to(dev)
49
- soft = get_units(source_path, hubert_soft).squeeze(0).cpu().numpy()
50
- pitch = transcribe(source_path, soft.shape[0], tran, feature_input)
51
- pitch = torch.LongTensor(pitch).unsqueeze(0).to(dev)
52
  stn_tst = torch.FloatTensor(soft)
53
  with torch.no_grad():
54
  x_tst = stn_tst.unsqueeze(0).to(dev)
55
  x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
56
  audio = \
57
- net_g_ms.infer(x_tst, x_tst_lengths, pitch, sid=sid, noise_scale=.3, noise_scale_w=0.5,
58
  length_scale=1)[0][
59
  0, 0].data.float().cpu().numpy()
60
- soundfile.write("./wav_temp/output/" + file_name, audio,
61
- int(audio.shape[0] / input_size * target_sample))
62
-
63
-
64
- def resize2d_f0(x, target_len):
65
- source = np.array(x)
66
- source[source < 0.001] = np.nan
67
- target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
68
- source)
69
- res = np.nan_to_num(target)
70
- return res
71
-
72
 
73
- # Deleting a file in Python: os.remove(path), where path is the absolute path of the file, e.g.:
74
- def del_file(path_data):
75
- for i in os.listdir(path_data): # os.listdir(path_data) returns a list of the relative paths of everything in this directory
76
- os.remove(path_data + i)
77
 
 
 
 
78
 
79
- def cut(c_time, file_path, vocal_name, out_dir):
80
- audio_segment = AudioSegment.from_file(file_path, format='wav')
81
 
82
- total = int(audio_segment.duration_seconds / c_time) # number of full slices after cutting the audio
83
- for i in range(total):
84
- # slice the audio every c_time seconds and name the slices sequentially
85
- audio_segment[i * c_time * 1000:(i + 1) * c_time * 1000].export(f"{out_dir}/{vocal_name}-{i}.wav",
86
- format="wav")
87
- audio_segment[total * c_time * 1000:].export(f"{out_dir}/{vocal_name}-{total}.wav", format="wav") # remaining tail segment shorter than c_time
88
-
89
-
90
- def wav_resample(audio_path, tar_sample):
91
  raw_audio, raw_sample_rate = torchaudio.load(audio_path)
92
- audio_22050 = torchaudio.transforms.Resample(orig_freq=raw_sample_rate, new_freq=tar_sample)(raw_audio)[0]
93
- soundfile.write(audio_path, audio_22050, tar_sample)
94
- return audio_path
 
 
95
 
96
 
97
  def fill_a_to_b(a, b):
98
  if len(a) < len(b):
99
  for _ in range(0, len(b) - len(a)):
100
  a.append(a[0])
 
 
 
 
 
 
 
1
+ import logging
2
  import os
3
+ import time
4
 
5
+ import matplotlib.pyplot as plt
6
  import numpy as np
 
7
  import torch
8
  import torchaudio
 
9
 
10
+ import hubert_model
11
+ import utils
12
+ from models import SynthesizerTrn
13
+ from preprocess_wave import FeatureInput
14
+
15
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
16
+ dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17
+
18
+
19
+ def timeit(func):
20
+ def run(*args, **kwargs):
21
+ t = time.time()
22
+ res = func(*args, **kwargs)
23
+ print('executing \'%s\' took %.3fs' % (func.__name__, time.time() - t))
24
+ return res
25
+
26
+ return run
27
+
28
+
29
+ def get_end_file(dir_path, end):
30
+ file_lists = []
31
+ for root, dirs, files in os.walk(dir_path):
32
+ files = [f for f in files if f[0] != '.']
33
+ dirs[:] = [d for d in dirs if d[0] != '.']
34
+ for f_file in files:
35
+ if f_file.endswith(end):
36
+ file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
37
+ return file_lists
38
+
39
+
40
+ def load_model(model_path, config_path):
41
+ # load the model configuration
42
+ hps_ms = utils.get_hparams_from_file(config_path)
43
+ n_g_ms = SynthesizerTrn(
44
+ 178,
45
+ hps_ms.data.filter_length // 2 + 1,
46
+ hps_ms.train.segment_size // hps_ms.data.hop_length,
47
+ n_speakers=hps_ms.data.n_speakers,
48
+ **hps_ms.model)
49
+ _ = utils.load_checkpoint(model_path, n_g_ms, None)
50
+ _ = n_g_ms.eval().to(dev)
51
+ # load the HuBERT soft content encoder
52
+ hubert_soft = hubert_model.hubert_soft(get_end_file("./", "pt")[0])
53
+ feature_input = FeatureInput(hps_ms.data.sampling_rate, hps_ms.data.hop_length)
54
+ return n_g_ms, hubert_soft, feature_input, hps_ms
55
 
56
 
57
+ def resize2d_f0(x, target_len):
58
+ source = np.array(x)
59
+ source[source < 0.001] = np.nan
60
+ target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
61
+ source)
62
+ res = np.nan_to_num(target)
63
+ return res
64
+
65
+
66
+ def get_units(in_path, hubert_soft):
67
+ source, sr = torchaudio.load(in_path)
68
  source = torchaudio.functional.resample(source, sr, 16000)
69
+ if len(source.shape) == 2 and source.shape[1] >= 2:
70
+ source = torch.mean(source, dim=0).unsqueeze(0)
71
  source = source.unsqueeze(0).to(dev)
72
  with torch.inference_mode():
73
  units = hubert_soft.units(source)
74
  return units
75
 
76
 
77
+ def transcribe(source_path, length, transform, feature_input):
78
+ feature_pit = feature_input.compute_f0(source_path)
79
  feature_pit = feature_pit * 2 ** (transform / 12)
80
  feature_pit = resize2d_f0(feature_pit, length)
81
  coarse_pit = feature_input.coarse_f0(feature_pit)
82
  return coarse_pit
83
 
84
 
85
+ def get_unit_pitch(in_path, tran, hubert_soft, feature_input):
86
+ soft = get_units(in_path, hubert_soft).squeeze(0).cpu().numpy()
87
+ input_pitch = transcribe(in_path, soft.shape[0], tran, feature_input)
88
+ return soft, input_pitch
89
+
90
+
91
+ def clean_pitch(input_pitch):
92
+ num_nan = np.sum(input_pitch == 1)
93
+ if num_nan / len(input_pitch) > 0.9:
94
+ input_pitch[input_pitch != 1] = 1
95
+ return input_pitch
96
+
97
+
98
+ def plt_pitch(input_pitch):
99
+ input_pitch = input_pitch.astype(float)
100
+ input_pitch[input_pitch == 1] = np.nan
101
+ return input_pitch
102
+
103
 
104
+ def f0_to_pitch(ff):
105
+ f0_pitch = 69 + 12 * np.log2(ff / 440)
106
+ return f0_pitch
107
 
 
 
 
 
108
 
109
+ def f0_plt(in_path, out_path, tran, hubert_soft, feature_input):
110
+ s1, input_pitch = get_unit_pitch(in_path, tran, hubert_soft, feature_input)
111
+ s2, output_pitch = get_unit_pitch(out_path, 0, hubert_soft, feature_input)
112
+ plt.clf()
113
+ plt.plot(plt_pitch(input_pitch), color="#66ccff")
114
+ plt.plot(plt_pitch(output_pitch), color="orange")
115
+ plt.savefig("temp.jpg")
116
+
117
+
118
+ def calc_error(in_path, out_path, tran, feature_input):
119
+ input_pitch = feature_input.compute_f0(in_path)
120
+ output_pitch = feature_input.compute_f0(out_path)
121
+ sum_y = []
122
+ if np.sum(input_pitch == 0) / len(input_pitch) > 0.9:
123
+ mistake, var_take = 0, 0
124
+ else:
125
+ for i in range(min(len(input_pitch), len(output_pitch))):
126
+ if input_pitch[i] > 0 and output_pitch[i] > 0:
127
+ sum_y.append(abs(f0_to_pitch(output_pitch[i]) - (f0_to_pitch(input_pitch[i]) + tran)))
128
+ num_y = 0
129
+ for x in sum_y:
130
+ num_y += x
131
+ len_y = len(sum_y) if len(sum_y) else 1
132
+ mistake = round(float(num_y / len_y), 2)
133
+ var_take = round(float(np.std(sum_y, ddof=1)), 2)
134
+ return mistake, var_take
135
+
136
+
137
+ def infer(source_path, speaker_id, tran, net_g_ms, hubert_soft, feature_input):
138
  sid = torch.LongTensor([int(speaker_id)]).to(dev)
139
+ soft, pitch = get_unit_pitch(source_path, tran, hubert_soft, feature_input)
140
+ pitch = torch.LongTensor(clean_pitch(pitch)).unsqueeze(0).to(dev)
 
141
  stn_tst = torch.FloatTensor(soft)
142
  with torch.no_grad():
143
  x_tst = stn_tst.unsqueeze(0).to(dev)
144
  x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
145
  audio = \
146
+ net_g_ms.infer(x_tst, x_tst_lengths, pitch, sid=sid, noise_scale=0.3, noise_scale_w=0.5,
147
  length_scale=1)[0][
148
  0, 0].data.float().cpu().numpy()
149
+ return audio, audio.shape[-1]
 
 
 
 
 
 
 
 
 
 
 
150
 
 
 
 
 
151
 
152
+ def del_temp_wav(path_data):
153
+ for i in get_end_file(path_data, "wav"): # get_end_file returns the paths of all .wav files under path_data
154
+ os.remove(i)
155
 
 
 
156
 
157
+ def format_wav(audio_path, tar_sample):
 
 
 
 
 
 
 
 
158
  raw_audio, raw_sample_rate = torchaudio.load(audio_path)
159
+ if len(raw_audio.shape) == 2 and raw_audio.shape[1] >= 2:
160
+ raw_audio = torch.mean(raw_audio, dim=0).unsqueeze(0)
161
+ tar_audio = torchaudio.functional.resample(raw_audio, raw_sample_rate, tar_sample)
162
+ torchaudio.save(audio_path[:-4] + ".wav", tar_audio, tar_sample)
163
+ return tar_audio, tar_sample
164
 
165
 
166
  def fill_a_to_b(a, b):
167
  if len(a) < len(b):
168
  for _ in range(0, len(b) - len(a)):
169
  a.append(a[0])
170
+
171
+
172
+ def mkdir(paths: list):
173
+ for path in paths:
174
+ if not os.path.exists(path):
175
+ os.mkdir(path)
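After this rewrite, infer_tool exposes a small end-to-end API: load_model builds the SynthesizerTrn, the HuBERT soft encoder and the FeatureInput f0 front end from a checkpoint/config pair, and infer returns the converted audio plus its length in samples. A hedged usage sketch; the file names and speaker id are placeholders, and load_model additionally expects a hubert-soft *.pt somewhere under the working directory:

```
import soundfile

import infer_tool

net_g, hubert, feature_input, hps = infer_tool.load_model("G_latest.pth", "config.json")

in_path = "raw/source.wav"                              # placeholder input clip
infer_tool.format_wav(in_path, hps.data.sampling_rate)  # downmix to mono and resample in place
audio, frames = infer_tool.infer(in_path, speaker_id=0, tran=0,
                                 net_g_ms=net_g, hubert_soft=hubert,
                                 feature_input=feature_input)
soundfile.write("out.wav", audio, hps.data.sampling_rate)
```
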
models.py CHANGED
@@ -1,15 +1,15 @@
1
- import copy
2
  import math
 
 
3
  import torch
4
  from torch import nn
 
5
  from torch.nn import functional as F
6
- import numpy as np
 
 
7
  import commons
8
  import modules
9
- import attentions
10
-
11
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
12
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13
  from commons import init_weights, get_padding
14
 
15
 
@@ -189,7 +189,7 @@ class TextEncoder(nn.Module):
189
 
190
  # self.emb = nn.Embedding(n_vocab, hidden_channels)
191
  # nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
192
- self.emb_pitch = nn.Embedding(128, hidden_channels)
193
  nn.init.normal_(self.emb_pitch.weight, 0.0, hidden_channels ** -0.5)
194
 
195
  self.encoder = attentions.Encoder(
@@ -491,8 +491,8 @@ class SynthesizerTrn(nn.Module):
491
  self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
492
  gin_channels=gin_channels)
493
  self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
494
- self.pitch_net = PitchPredictor(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers,
495
- kernel_size, p_dropout)
496
 
497
  if use_sdp:
498
  self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
@@ -504,12 +504,6 @@ class SynthesizerTrn(nn.Module):
504
 
505
  def infer(self, x, x_lengths, pitch, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
506
  x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, pitch)
507
- pred_pitch, pitch_embedding = self.pitch_net(x, x_mask)
508
- x = x + pitch_embedding
509
- # print(pred_pitch)
510
- gt_lf0 = torch.log(440 * (2 ** ((pitch - 69) / 12)))
511
-
512
- # print(gt_lf0)
513
  if self.n_speakers > 0:
514
  g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
515
  else:
 
 
1
  import math
2
+ import math
3
+
4
  import torch
5
  from torch import nn
6
+ from torch.nn import Conv1d, ConvTranspose1d, Conv2d
7
  from torch.nn import functional as F
8
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
9
+
10
+ import attentions
11
  import commons
12
  import modules
 
 
 
 
13
  from commons import init_weights, get_padding
14
 
15
 
 
189
 
190
  # self.emb = nn.Embedding(n_vocab, hidden_channels)
191
  # nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
192
+ self.emb_pitch = nn.Embedding(256, hidden_channels)
193
  nn.init.normal_(self.emb_pitch.weight, 0.0, hidden_channels ** -0.5)
194
 
195
  self.encoder = attentions.Encoder(
 
491
  self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
492
  gin_channels=gin_channels)
493
  self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
494
+ # self.pitch_net = PitchPredictor(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers,
495
+ # kernel_size, p_dropout)
496
 
497
  if use_sdp:
498
  self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
 
504
 
505
  def infer(self, x, x_lengths, pitch, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
506
  x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, pitch)
 
 
 
 
 
 
507
  if self.n_speakers > 0:
508
  g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
509
  else:
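Two things change in models.py above: emb_pitch grows from 128 to 256 entries, and the PitchPredictor path is commented out of inference, so the encoder embeds the coarse pitch ids directly instead of also adding a predicted-pitch embedding. The 256 size lines up with FeatureInput.coarse_f0 in preprocess_wave.py below, whose output is asserted to stay in [1, 255]. A minimal lookup sketch (hidden_channels = 192 is a placeholder; the real value comes from the model config):

import torch
from torch import nn

hidden_channels = 192                                # placeholder; read from config.json in practice
emb_pitch = nn.Embedding(256, hidden_channels)
nn.init.normal_(emb_pitch.weight, 0.0, hidden_channels ** -0.5)

coarse_pitch = torch.randint(1, 256, (1, 120))       # [batch, frames], ids in [1, 255]
pitch_emb = emb_pitch(coarse_pitch)                  # [1, 120, hidden_channels]
print(pitch_emb.shape)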
modules.py CHANGED
@@ -1,187 +1,184 @@
1
- import copy
2
  import math
3
- import numpy as np
4
- import scipy
5
  import torch
6
  from torch import nn
7
- from torch.nn import functional as F
8
-
9
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
  from torch.nn.utils import weight_norm, remove_weight_norm
11
 
12
  import commons
13
  from commons import init_weights, get_padding
14
  from transforms import piecewise_rational_quadratic_transform
15
 
16
-
17
  LRELU_SLOPE = 0.1
18
 
19
 
20
  class LayerNorm(nn.Module):
21
- def __init__(self, channels, eps=1e-5):
22
- super().__init__()
23
- self.channels = channels
24
- self.eps = eps
 
 
 
25
 
26
- self.gamma = nn.Parameter(torch.ones(channels))
27
- self.beta = nn.Parameter(torch.zeros(channels))
 
 
28
 
29
- def forward(self, x):
30
- x = x.transpose(1, -1)
31
- x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32
- return x.transpose(1, -1)
33
 
34
-
35
  class ConvReluNorm(nn.Module):
36
- def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
37
- super().__init__()
38
- self.in_channels = in_channels
39
- self.hidden_channels = hidden_channels
40
- self.out_channels = out_channels
41
- self.kernel_size = kernel_size
42
- self.n_layers = n_layers
43
- self.p_dropout = p_dropout
44
- assert n_layers > 1, "Number of layers should be larger than 0."
45
-
46
- self.conv_layers = nn.ModuleList()
47
- self.norm_layers = nn.ModuleList()
48
- self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
49
- self.norm_layers.append(LayerNorm(hidden_channels))
50
- self.relu_drop = nn.Sequential(
51
- nn.ReLU(),
52
- nn.Dropout(p_dropout))
53
- for _ in range(n_layers-1):
54
- self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
55
- self.norm_layers.append(LayerNorm(hidden_channels))
56
- self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
57
- self.proj.weight.data.zero_()
58
- self.proj.bias.data.zero_()
59
-
60
- def forward(self, x, x_mask):
61
- x_org = x
62
- for i in range(self.n_layers):
63
- x = self.conv_layers[i](x * x_mask)
64
- x = self.norm_layers[i](x)
65
- x = self.relu_drop(x)
66
- x = x_org + self.proj(x)
67
- return x * x_mask
68
 
69
 
70
  class DDSConv(nn.Module):
71
- """
72
- Dialted and Depth-Separable Convolution
73
- """
74
- def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
75
- super().__init__()
76
- self.channels = channels
77
- self.kernel_size = kernel_size
78
- self.n_layers = n_layers
79
- self.p_dropout = p_dropout
80
-
81
- self.drop = nn.Dropout(p_dropout)
82
- self.convs_sep = nn.ModuleList()
83
- self.convs_1x1 = nn.ModuleList()
84
- self.norms_1 = nn.ModuleList()
85
- self.norms_2 = nn.ModuleList()
86
- for i in range(n_layers):
87
- dilation = kernel_size ** i
88
- padding = (kernel_size * dilation - dilation) // 2
89
- self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
90
- groups=channels, dilation=dilation, padding=padding
91
- ))
92
- self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
93
- self.norms_1.append(LayerNorm(channels))
94
- self.norms_2.append(LayerNorm(channels))
95
-
96
- def forward(self, x, x_mask, g=None):
97
- if g is not None:
98
- x = x + g
99
- for i in range(self.n_layers):
100
- y = self.convs_sep[i](x * x_mask)
101
- y = self.norms_1[i](y)
102
- y = F.gelu(y)
103
- y = self.convs_1x1[i](y)
104
- y = self.norms_2[i](y)
105
- y = F.gelu(y)
106
- y = self.drop(y)
107
- x = x + y
108
- return x * x_mask
 
109
 
110
 
111
  class WN(torch.nn.Module):
112
- def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
113
- super(WN, self).__init__()
114
- assert(kernel_size % 2 == 1)
115
- self.hidden_channels =hidden_channels
116
- self.kernel_size = kernel_size,
117
- self.dilation_rate = dilation_rate
118
- self.n_layers = n_layers
119
- self.gin_channels = gin_channels
120
- self.p_dropout = p_dropout
121
-
122
- self.in_layers = torch.nn.ModuleList()
123
- self.res_skip_layers = torch.nn.ModuleList()
124
- self.drop = nn.Dropout(p_dropout)
125
-
126
- if gin_channels != 0:
127
- cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
128
- self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
129
-
130
- for i in range(n_layers):
131
- dilation = dilation_rate ** i
132
- padding = int((kernel_size * dilation - dilation) / 2)
133
- in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
134
- dilation=dilation, padding=padding)
135
- in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
136
- self.in_layers.append(in_layer)
137
-
138
- # last one is not necessary
139
- if i < n_layers - 1:
140
- res_skip_channels = 2 * hidden_channels
141
- else:
142
- res_skip_channels = hidden_channels
143
-
144
- res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
145
- res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
146
- self.res_skip_layers.append(res_skip_layer)
147
-
148
- def forward(self, x, x_mask, g=None, **kwargs):
149
- output = torch.zeros_like(x)
150
- n_channels_tensor = torch.IntTensor([self.hidden_channels])
151
-
152
- if g is not None:
153
- g = self.cond_layer(g)
154
-
155
- for i in range(self.n_layers):
156
- x_in = self.in_layers[i](x)
157
- if g is not None:
158
- cond_offset = i * 2 * self.hidden_channels
159
- g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
160
- else:
161
- g_l = torch.zeros_like(x_in)
162
-
163
- acts = commons.fused_add_tanh_sigmoid_multiply(
164
- x_in,
165
- g_l,
166
- n_channels_tensor)
167
- acts = self.drop(acts)
168
-
169
- res_skip_acts = self.res_skip_layers[i](acts)
170
- if i < self.n_layers - 1:
171
- res_acts = res_skip_acts[:,:self.hidden_channels,:]
172
- x = (x + res_acts) * x_mask
173
- output = output + res_skip_acts[:,self.hidden_channels:,:]
174
- else:
175
- output = output + res_skip_acts
176
- return output * x_mask
177
-
178
- def remove_weight_norm(self):
179
- if self.gin_channels != 0:
180
- torch.nn.utils.remove_weight_norm(self.cond_layer)
181
- for l in self.in_layers:
182
- torch.nn.utils.remove_weight_norm(l)
183
- for l in self.res_skip_layers:
184
- torch.nn.utils.remove_weight_norm(l)
185
 
186
 
187
  class ResBlock1(torch.nn.Module):
@@ -209,11 +206,11 @@ class ResBlock1(torch.nn.Module):
209
 
210
  def forward(self, x, x_mask=None):
211
  for c1, c2 in zip(self.convs1, self.convs2):
212
- xt = F.leaky_relu(x, LRELU_SLOPE)
213
  if x_mask is not None:
214
  xt = xt * x_mask
215
  xt = c1(xt)
216
- xt = F.leaky_relu(xt, LRELU_SLOPE)
217
  if x_mask is not None:
218
  xt = xt * x_mask
219
  xt = c2(xt)
@@ -242,7 +239,7 @@ class ResBlock2(torch.nn.Module):
242
 
243
  def forward(self, x, x_mask=None):
244
  for c in self.convs:
245
- xt = F.leaky_relu(x, LRELU_SLOPE)
246
  if x_mask is not None:
247
  xt = xt * x_mask
248
  xt = c(xt)
@@ -257,134 +254,135 @@ class ResBlock2(torch.nn.Module):
257
 
258
 
259
  class Log(nn.Module):
260
- def forward(self, x, x_mask, reverse=False, **kwargs):
261
- if not reverse:
262
- y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
263
- logdet = torch.sum(-y, [1, 2])
264
- return y, logdet
265
- else:
266
- x = torch.exp(x) * x_mask
267
- return x
268
-
269
 
270
  class Flip(nn.Module):
271
- def forward(self, x, *args, reverse=False, **kwargs):
272
- x = torch.flip(x, [1])
273
- if not reverse:
274
- logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
275
- return x, logdet
276
- else:
277
- return x
278
 
279
 
280
  class ElementwiseAffine(nn.Module):
281
- def __init__(self, channels):
282
- super().__init__()
283
- self.channels = channels
284
- self.m = nn.Parameter(torch.zeros(channels,1))
285
- self.logs = nn.Parameter(torch.zeros(channels,1))
286
-
287
- def forward(self, x, x_mask, reverse=False, **kwargs):
288
- if not reverse:
289
- y = self.m + torch.exp(self.logs) * x
290
- y = y * x_mask
291
- logdet = torch.sum(self.logs * x_mask, [1,2])
292
- return y, logdet
293
- else:
294
- x = (x - self.m) * torch.exp(-self.logs) * x_mask
295
- return x
296
 
297
 
298
  class ResidualCouplingLayer(nn.Module):
299
- def __init__(self,
300
- channels,
301
- hidden_channels,
302
- kernel_size,
303
- dilation_rate,
304
- n_layers,
305
- p_dropout=0,
306
- gin_channels=0,
307
- mean_only=False):
308
- assert channels % 2 == 0, "channels should be divisible by 2"
309
- super().__init__()
310
- self.channels = channels
311
- self.hidden_channels = hidden_channels
312
- self.kernel_size = kernel_size
313
- self.dilation_rate = dilation_rate
314
- self.n_layers = n_layers
315
- self.half_channels = channels // 2
316
- self.mean_only = mean_only
317
-
318
- self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
319
- self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
320
- self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
321
- self.post.weight.data.zero_()
322
- self.post.bias.data.zero_()
323
-
324
- def forward(self, x, x_mask, g=None, reverse=False):
325
- x0, x1 = torch.split(x, [self.half_channels]*2, 1)
326
- h = self.pre(x0) * x_mask
327
- h = self.enc(h, x_mask, g=g)
328
- stats = self.post(h) * x_mask
329
- if not self.mean_only:
330
- m, logs = torch.split(stats, [self.half_channels]*2, 1)
331
- else:
332
- m = stats
333
- logs = torch.zeros_like(m)
334
-
335
- if not reverse:
336
- x1 = m + x1 * torch.exp(logs) * x_mask
337
- x = torch.cat([x0, x1], 1)
338
- logdet = torch.sum(logs, [1,2])
339
- return x, logdet
340
- else:
341
- x1 = (x1 - m) * torch.exp(-logs) * x_mask
342
- x = torch.cat([x0, x1], 1)
343
- return x
 
344
 
345
 
346
  class ConvFlow(nn.Module):
347
- def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
348
- super().__init__()
349
- self.in_channels = in_channels
350
- self.filter_channels = filter_channels
351
- self.kernel_size = kernel_size
352
- self.n_layers = n_layers
353
- self.num_bins = num_bins
354
- self.tail_bound = tail_bound
355
- self.half_channels = in_channels // 2
356
-
357
- self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
358
- self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
359
- self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
360
- self.proj.weight.data.zero_()
361
- self.proj.bias.data.zero_()
362
-
363
- def forward(self, x, x_mask, g=None, reverse=False):
364
- x0, x1 = torch.split(x, [self.half_channels]*2, 1)
365
- h = self.pre(x0)
366
- h = self.convs(h, x_mask, g=g)
367
- h = self.proj(h) * x_mask
368
-
369
- b, c, t = x0.shape
370
- h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
371
-
372
- unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
373
- unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
374
- unnormalized_derivatives = h[..., 2 * self.num_bins:]
375
-
376
- x1, logabsdet = piecewise_rational_quadratic_transform(x1,
377
- unnormalized_widths,
378
- unnormalized_heights,
379
- unnormalized_derivatives,
380
- inverse=reverse,
381
- tails='linear',
382
- tail_bound=self.tail_bound
383
- )
384
-
385
- x = torch.cat([x0, x1], 1) * x_mask
386
- logdet = torch.sum(logabsdet * x_mask, [1,2])
387
- if not reverse:
388
- return x, logdet
389
- else:
390
- return x
 
 
1
  import math
2
+
 
3
  import torch
4
  from torch import nn
5
+ from torch.nn import Conv1d
6
+ from torch.nn import functional as t_func
 
7
  from torch.nn.utils import weight_norm, remove_weight_norm
8
 
9
  import commons
10
  from commons import init_weights, get_padding
11
  from transforms import piecewise_rational_quadratic_transform
12
 
 
13
  LRELU_SLOPE = 0.1
14
 
15
 
16
  class LayerNorm(nn.Module):
17
+ def __init__(self, channels, eps=1e-5):
18
+ super().__init__()
19
+ self.channels = channels
20
+ self.eps = eps
21
+
22
+ self.gamma = nn.Parameter(torch.ones(channels))
23
+ self.beta = nn.Parameter(torch.zeros(channels))
24
 
25
+ def forward(self, x):
26
+ x = x.transpose(1, -1)
27
+ x = t_func.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
28
+ return x.transpose(1, -1)
29
 
 
 
 
 
30
 
 
31
  class ConvReluNorm(nn.Module):
32
+ def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
33
+ super().__init__()
34
+ self.in_channels = in_channels
35
+ self.hidden_channels = hidden_channels
36
+ self.out_channels = out_channels
37
+ self.kernel_size = kernel_size
38
+ self.n_layers = n_layers
39
+ self.p_dropout = p_dropout
40
+ assert n_layers > 1, "Number of layers should be larger than 1."
41
+
42
+ self.conv_layers = nn.ModuleList()
43
+ self.norm_layers = nn.ModuleList()
44
+ self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
45
+ self.norm_layers.append(LayerNorm(hidden_channels))
46
+ self.relu_drop = nn.Sequential(
47
+ nn.ReLU(),
48
+ nn.Dropout(p_dropout))
49
+ for _ in range(n_layers - 1):
50
+ self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
51
+ self.norm_layers.append(LayerNorm(hidden_channels))
52
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
53
+ self.proj.weight.data.zero_()
54
+ self.proj.bias.data.zero_()
55
+
56
+ def forward(self, x, x_mask):
57
+ x_org = x
58
+ for i in range(self.n_layers):
59
+ x = self.conv_layers[i](x * x_mask)
60
+ x = self.norm_layers[i](x)
61
+ x = self.relu_drop(x)
62
+ x = x_org + self.proj(x)
63
+ return x * x_mask
64
 
65
 
66
  class DDSConv(nn.Module):
67
+ """
68
+ Dialted and Depth-Separable Convolution
69
+ """
70
+
71
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
72
+ super().__init__()
73
+ self.channels = channels
74
+ self.kernel_size = kernel_size
75
+ self.n_layers = n_layers
76
+ self.p_dropout = p_dropout
77
+
78
+ self.drop = nn.Dropout(p_dropout)
79
+ self.convs_sep = nn.ModuleList()
80
+ self.convs_1x1 = nn.ModuleList()
81
+ self.norms_1 = nn.ModuleList()
82
+ self.norms_2 = nn.ModuleList()
83
+ for i in range(n_layers):
84
+ dilation = kernel_size ** i
85
+ padding = (kernel_size * dilation - dilation) // 2
86
+ self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
87
+ groups=channels, dilation=dilation, padding=padding
88
+ ))
89
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
90
+ self.norms_1.append(LayerNorm(channels))
91
+ self.norms_2.append(LayerNorm(channels))
92
+
93
+ def forward(self, x, x_mask, g=None):
94
+ if g is not None:
95
+ x = x + g
96
+ for i in range(self.n_layers):
97
+ y = self.convs_sep[i](x * x_mask)
98
+ y = self.norms_1[i](y)
99
+ y = t_func.gelu(y)
100
+ y = self.convs_1x1[i](y)
101
+ y = self.norms_2[i](y)
102
+ y = t_func.gelu(y)
103
+ y = self.drop(y)
104
+ x = x + y
105
+ return x * x_mask
106
 
107
 
108
  class WN(torch.nn.Module):
109
+ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
110
+ super(WN, self).__init__()
111
+ assert (kernel_size % 2 == 1)
112
+ self.hidden_channels = hidden_channels
113
+ self.kernel_size = kernel_size,
114
+ self.dilation_rate = dilation_rate
115
+ self.n_layers = n_layers
116
+ self.gin_channels = gin_channels
117
+ self.p_dropout = p_dropout
118
+
119
+ self.in_layers = torch.nn.ModuleList()
120
+ self.res_skip_layers = torch.nn.ModuleList()
121
+ self.drop = nn.Dropout(p_dropout)
122
+
123
+ if gin_channels != 0:
124
+ cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
125
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
126
+
127
+ for i in range(n_layers):
128
+ dilation = dilation_rate ** i
129
+ padding = int((kernel_size * dilation - dilation) / 2)
130
+ in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
131
+ dilation=dilation, padding=padding)
132
+ in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
133
+ self.in_layers.append(in_layer)
134
+
135
+ # last one is not necessary
136
+ if i < n_layers - 1:
137
+ res_skip_channels = 2 * hidden_channels
138
+ else:
139
+ res_skip_channels = hidden_channels
140
+
141
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
142
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
143
+ self.res_skip_layers.append(res_skip_layer)
144
+
145
+ def forward(self, x, x_mask, g=None, **kwargs):
146
+ output = torch.zeros_like(x)
147
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
148
+
149
+ if g is not None:
150
+ g = self.cond_layer(g)
151
+
152
+ for i in range(self.n_layers):
153
+ x_in = self.in_layers[i](x)
154
+ if g is not None:
155
+ cond_offset = i * 2 * self.hidden_channels
156
+ g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
157
+ else:
158
+ g_l = torch.zeros_like(x_in)
159
+
160
+ acts = commons.fused_add_tanh_sigmoid_multiply(
161
+ x_in,
162
+ g_l,
163
+ n_channels_tensor)
164
+ acts = self.drop(acts)
165
+
166
+ res_skip_acts = self.res_skip_layers[i](acts)
167
+ if i < self.n_layers - 1:
168
+ res_acts = res_skip_acts[:, :self.hidden_channels, :]
169
+ x = (x + res_acts) * x_mask
170
+ output = output + res_skip_acts[:, self.hidden_channels:, :]
171
+ else:
172
+ output = output + res_skip_acts
173
+ return output * x_mask
174
+
175
+ def remove_weight_norm(self):
176
+ if self.gin_channels != 0:
177
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
178
+ for l in self.in_layers:
179
+ torch.nn.utils.remove_weight_norm(l)
180
+ for l in self.res_skip_layers:
181
+ torch.nn.utils.remove_weight_norm(l)
182
 
183
 
184
  class ResBlock1(torch.nn.Module):
 
206
 
207
  def forward(self, x, x_mask=None):
208
  for c1, c2 in zip(self.convs1, self.convs2):
209
+ xt = t_func.leaky_relu(x, LRELU_SLOPE)
210
  if x_mask is not None:
211
  xt = xt * x_mask
212
  xt = c1(xt)
213
+ xt = t_func.leaky_relu(xt, LRELU_SLOPE)
214
  if x_mask is not None:
215
  xt = xt * x_mask
216
  xt = c2(xt)
 
239
 
240
  def forward(self, x, x_mask=None):
241
  for c in self.convs:
242
+ xt = t_func.leaky_relu(x, LRELU_SLOPE)
243
  if x_mask is not None:
244
  xt = xt * x_mask
245
  xt = c(xt)
 
254
 
255
 
256
  class Log(nn.Module):
257
+ def forward(self, x, x_mask, reverse=False, **kwargs):
258
+ if not reverse:
259
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
260
+ logdet = torch.sum(-y, [1, 2])
261
+ return y, logdet
262
+ else:
263
+ x = torch.exp(x) * x_mask
264
+ return x
265
+
266
 
267
  class Flip(nn.Module):
268
+ def forward(self, x, *args, reverse=False, **kwargs):
269
+ x = torch.flip(x, [1])
270
+ if not reverse:
271
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
272
+ return x, logdet
273
+ else:
274
+ return x
275
 
276
 
277
  class ElementwiseAffine(nn.Module):
278
+ def __init__(self, channels):
279
+ super().__init__()
280
+ self.channels = channels
281
+ self.m = nn.Parameter(torch.zeros(channels, 1))
282
+ self.logs = nn.Parameter(torch.zeros(channels, 1))
283
+
284
+ def forward(self, x, x_mask, reverse=False, **kwargs):
285
+ if not reverse:
286
+ y = self.m + torch.exp(self.logs) * x
287
+ y = y * x_mask
288
+ logdet = torch.sum(self.logs * x_mask, [1, 2])
289
+ return y, logdet
290
+ else:
291
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
292
+ return x
293
 
294
 
295
  class ResidualCouplingLayer(nn.Module):
296
+ def __init__(self,
297
+ channels,
298
+ hidden_channels,
299
+ kernel_size,
300
+ dilation_rate,
301
+ n_layers,
302
+ p_dropout=0,
303
+ gin_channels=0,
304
+ mean_only=False):
305
+ assert channels % 2 == 0, "channels should be divisible by 2"
306
+ super().__init__()
307
+ self.channels = channels
308
+ self.hidden_channels = hidden_channels
309
+ self.kernel_size = kernel_size
310
+ self.dilation_rate = dilation_rate
311
+ self.n_layers = n_layers
312
+ self.half_channels = channels // 2
313
+ self.mean_only = mean_only
314
+
315
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
316
+ self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
317
+ gin_channels=gin_channels)
318
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
319
+ self.post.weight.data.zero_()
320
+ self.post.bias.data.zero_()
321
+
322
+ def forward(self, x, x_mask, g=None, reverse=False):
323
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
324
+ h = self.pre(x0) * x_mask
325
+ h = self.enc(h, x_mask, g=g)
326
+ stats = self.post(h) * x_mask
327
+ if not self.mean_only:
328
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
329
+ else:
330
+ m = stats
331
+ logs = torch.zeros_like(m)
332
+
333
+ if not reverse:
334
+ x1 = m + x1 * torch.exp(logs) * x_mask
335
+ x = torch.cat([x0, x1], 1)
336
+ logdet = torch.sum(logs, [1, 2])
337
+ return x, logdet
338
+ else:
339
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
340
+ x = torch.cat([x0, x1], 1)
341
+ return x
342
 
343
 
344
  class ConvFlow(nn.Module):
345
+ def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
346
+ super().__init__()
347
+ self.in_channels = in_channels
348
+ self.filter_channels = filter_channels
349
+ self.kernel_size = kernel_size
350
+ self.n_layers = n_layers
351
+ self.num_bins = num_bins
352
+ self.tail_bound = tail_bound
353
+ self.half_channels = in_channels // 2
354
+
355
+ self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
356
+ self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
357
+ self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
358
+ self.proj.weight.data.zero_()
359
+ self.proj.bias.data.zero_()
360
+
361
+ def forward(self, x, x_mask, g=None, reverse=False):
362
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
363
+ h = self.pre(x0)
364
+ h = self.convs(h, x_mask, g=g)
365
+ h = self.proj(h) * x_mask
366
+
367
+ b, c, t = x0.shape
368
+ h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
369
+
370
+ unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
371
+ unnormalized_heights = h[..., self.num_bins:2 * self.num_bins] / math.sqrt(self.filter_channels)
372
+ unnormalized_derivatives = h[..., 2 * self.num_bins:]
373
+
374
+ x1, logabsdet = piecewise_rational_quadratic_transform(x1,
375
+ unnormalized_widths,
376
+ unnormalized_heights,
377
+ unnormalized_derivatives,
378
+ inverse=reverse,
379
+ tails='linear',
380
+ tail_bound=self.tail_bound
381
+ )
382
+
383
+ x = torch.cat([x0, x1], 1) * x_mask
384
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
385
+ if not reverse:
386
+ return x, logdet
387
+ else:
388
+ return x
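Functionally, the modules.py changes above are a reformat (plus renaming the functional alias from F to t_func); the WN block still leans on commons.fused_add_tanh_sigmoid_multiply. Assuming that helper implements the usual WaveNet-style gated activation (an assumption, since commons.py is not part of this diff), it behaves like this sketch:

import torch

def fused_add_tanh_sigmoid_multiply(a, b, n_channels):
    # assumed behaviour: add the conditioning, then gate the first half of the
    # channels with tanh and the second half with sigmoid
    n = int(n_channels[0])
    x = a + b
    return torch.tanh(x[:, :n, :]) * torch.sigmoid(x[:, n:, :])

hidden = 192
x_in = torch.randn(1, 2 * hidden, 50)                # output of an in_layer
g_l = torch.zeros_like(x_in)                         # zero conditioning when g is None
acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, torch.IntTensor([hidden]))
print(acts.shape)                                    # torch.Size([1, 192, 50])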
preprocess_wave.py ADDED
@@ -0,0 +1,118 @@
+ import os
+
+ import librosa
+ import numpy as np
+ import pyworld
+ from scipy.io import wavfile
+
+ import utils
+
+
+ class FeatureInput(object):
+     def __init__(self, samplerate=16000, hop_size=160):
+         self.fs = samplerate
+         self.hop = hop_size
+
+         self.f0_bin = 256
+         self.f0_max = 1100.0
+         self.f0_min = 50.0
+         self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
+         self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+
+     def compute_f0(self, path):
+         x, sr = librosa.load(path, sr=self.fs)
+         assert sr == self.fs
+         f0, t = pyworld.dio(
+             x.astype(np.double),
+             fs=sr,
+             f0_ceil=800,
+             frame_period=1000 * self.hop / sr,
+         )
+         f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
+         for index, pitch in enumerate(f0):
+             f0[index] = round(pitch, 1)
+         return f0
+
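compute_f0 runs WORLD's DIO estimator followed by StoneMask refinement, producing one f0 value per hop (160 samples at 16 kHz, about 10 ms), so the pitch track stays frame-aligned with the unit sequence. A standalone sketch of the same call chain (the wav path is a placeholder):

import librosa
import numpy as np
import pyworld

sr, hop = 16000, 160
x, _ = librosa.load("raw/source.wav", sr=sr)         # placeholder path
f0, t = pyworld.dio(x.astype(np.double), fs=sr,
                    f0_ceil=800, frame_period=1000 * hop / sr)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, sr)
print(len(f0), len(x) // hop + 1)                    # roughly one f0 value every 10 ms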
+     # for numpy  # code from diffsinger
+     def coarse_f0(self, f0):
+         f0_mel = 1127 * np.log(1 + f0 / 700)
+         f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
+             self.f0_bin - 2
+         ) / (self.f0_mel_max - self.f0_mel_min) + 1
+
+         # use 0 or 1
+         f0_mel[f0_mel <= 1] = 1
+         f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
+         f0_coarse = np.rint(f0_mel).astype(np.int)
+         assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
+             f0_coarse.max(),
+             f0_coarse.min(),
+         )
+         return f0_coarse
+
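coarse_f0 quantizes f0 onto a mel-spaced scale: unvoiced frames (f0 = 0) and f0_min both land on bin 1, while f0_max lands on bin 255, which is exactly the id range the 256-entry emb_pitch table in models.py expects. A compact numpy restatement with a few probe values (editor's sketch):

import numpy as np

f0_bin, f0_min, f0_max = 256, 50.0, 1100.0
mel_min = 1127 * np.log(1 + f0_min / 700)
mel_max = 1127 * np.log(1 + f0_max / 700)

def coarse(f0):
    f0 = np.asarray(f0, dtype=float)
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - mel_min) * (f0_bin - 2) / (mel_max - mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
    return np.rint(f0_mel).astype(int)   # plain int avoids the deprecated np.int alias

print(coarse([0.0, 50.0, 220.0, 440.0, 1100.0]))   # unvoiced -> 1, f0_max -> 255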
+     # for tensor  # code from diffsinger
+     def coarse_f0_ts(self, f0):
+         f0_mel = 1127 * (1 + f0 / 700).log()
+         f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
+             self.f0_bin - 2
+         ) / (self.f0_mel_max - self.f0_mel_min) + 1
+
+         # use 0 or 1
+         f0_mel[f0_mel <= 1] = 1
+         f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
+         f0_coarse = (f0_mel + 0.5).long()
+         assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
+             f0_coarse.max(),
+             f0_coarse.min(),
+         )
+         return f0_coarse
+
+     def save_wav(self, wav, path):
+         wav *= 32767 / max(0.01, np.max(np.abs(wav))) * 0.6
+         wavfile.write(path, self.fs, wav.astype(np.int16))
+
+
+ if __name__ == "__main__":
+     wavPath = "./data/waves"
+     outPath = "./data/label"
+     if not os.path.exists("./data/label"):
+         os.mkdir("./data/label")
+
+     # define model and load checkpoint
+     hps = utils.get_hparams_from_file("./configs/singing_base.json")
+     featureInput = FeatureInput(hps.data.sampling_rate, hps.data.hop_length)
+     vits_file = open("./filelists/vc_file.txt", "w", encoding="utf-8")
+
+     for spks in os.listdir(wavPath):
+         if os.path.isdir(f"./{wavPath}/{spks}"):
+             os.makedirs(f"./{outPath}/{spks}")
+             for file in os.listdir(f"./{wavPath}/{spks}"):
+                 if file.endswith(".wav"):
+                     file = file[:-4]
+                     audio_path = f"./{wavPath}/{spks}/{file}.wav"
+                     featur_pit = featureInput.compute_f0(audio_path)
+                     coarse_pit = featureInput.coarse_f0(featur_pit)
+                     np.save(
+                         f"{outPath}/{spks}/{file}_pitch.npy",
+                         coarse_pit,
+                         allow_pickle=False,
+                     )
+                     np.save(
+                         f"{outPath}/{spks}/{file}_nsff0.npy",
+                         featur_pit,
+                         allow_pickle=False,
+                     )
+
+                     path_audio = f"./data/waves/{spks}/{file}.wav"
+                     path_spkid = f"./data/spkid/{spks}.npy"
+                     path_label = (
+                         f"./data/phone/{spks}/{file}.npy"  # phone means ppg & hubert
+                     )
+                     path_pitch = f"./data/label/{spks}/{file}_pitch.npy"
+                     path_nsff0 = f"./data/label/{spks}/{file}_nsff0.npy"
+                     print(
+                         f"{path_audio}|{path_spkid}|{path_label}|{path_pitch}|{path_nsff0}",
+                         file=vits_file,
+                     )
+
+     vits_file.close()
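Each processed clip contributes one pipe-separated line to filelists/vc_file.txt: audio path, speaker-id file, phone/HuBERT feature file, coarse pitch file, and raw f0 file. A tiny sketch of reading such a line back (the concrete paths are made up for illustration):

line = ("./data/waves/spk0/001.wav|./data/spkid/spk0.npy|"
        "./data/phone/spk0/001.npy|./data/label/spk0/001_pitch.npy|"
        "./data/label/spk0/001_nsff0.npy")
path_audio, path_spkid, path_label, path_pitch, path_nsff0 = line.strip().split("|")
print(path_pitch)                                    # ./data/label/spk0/001_pitch.npy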
requirements.txt CHANGED
@@ -1,16 +1,16 @@
  Cython==0.29.21
  librosa==0.8.0
- matplotlib
- phonemizer
- scipy
  torch
  torchvision
- Unidecode
  torchaudio
  pyworld
  keras
  mir-eval
  pretty-midi
- tensorflow
- numpy
- pydub

  Cython==0.29.21
  librosa==0.8.0
+ matplotlib==3.3.1
+ numpy==1.18.5
+ phonemizer==2.2.1
+ scipy==1.5.2
  torch
  torchvision
+ Unidecode==1.1.1
  torchaudio
  pyworld
+ scipy
  keras
  mir-eval
  pretty-midi
+ pydub
 
 
text/LICENSE DELETED
@@ -1,19 +0,0 @@
1
- Copyright (c) 2017 Keith Ito
2
-
3
- Permission is hereby granted, free of charge, to any person obtaining a copy
4
- of this software and associated documentation files (the "Software"), to deal
5
- in the Software without restriction, including without limitation the rights
6
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
- copies of the Software, and to permit persons to whom the Software is
8
- furnished to do so, subject to the following conditions:
9
-
10
- The above copyright notice and this permission notice shall be included in
11
- all copies or substantial portions of the Software.
12
-
13
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
- THE SOFTWARE.
text/__init__.py DELETED
@@ -1,54 +0,0 @@
1
- """ from https://github.com/keithito/tacotron """
2
- from text import cleaners
3
- from text.symbols import symbols
4
-
5
-
6
- # Mappings from symbol to numeric ID and vice versa:
7
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8
- _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9
-
10
-
11
- def text_to_sequence(text, cleaner_names):
12
- '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
13
- Args:
14
- text: string to convert to a sequence
15
- cleaner_names: names of the cleaner functions to run the text through
16
- Returns:
17
- List of integers corresponding to the symbols in the text
18
- '''
19
- sequence = []
20
-
21
- clean_text = _clean_text(text, cleaner_names)
22
- for symbol in clean_text:
23
- symbol_id = _symbol_to_id[symbol]
24
- sequence += [symbol_id]
25
- return sequence
26
-
27
-
28
- def cleaned_text_to_sequence(cleaned_text):
29
- '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
30
- Args:
31
- text: string to convert to a sequence
32
- Returns:
33
- List of integers corresponding to the symbols in the text
34
- '''
35
- sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
36
- return sequence
37
-
38
-
39
- def sequence_to_text(sequence):
40
- '''Converts a sequence of IDs back to a string'''
41
- result = ''
42
- for symbol_id in sequence:
43
- s = _id_to_symbol[symbol_id]
44
- result += s
45
- return result
46
-
47
-
48
- def _clean_text(text, cleaner_names):
49
- for name in cleaner_names:
50
- cleaner = getattr(cleaners, name)
51
- if not cleaner:
52
- raise Exception('Unknown cleaner: %s' % name)
53
- text = cleaner(text)
54
- return text
@@ -1,100 +0,0 @@
1
- """ from https://github.com/keithito/tacotron """
2
-
3
- '''
4
- Cleaners are transformations that run over the input text at both training and eval time.
5
-
6
- Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7
- hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8
- 1. "english_cleaners" for English text
9
- 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10
- the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11
- 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12
- the symbols in symbols.py to match your data).
13
- '''
14
-
15
- import re
16
- from unidecode import unidecode
17
- from phonemizer import phonemize
18
-
19
-
20
- # Regular expression matching whitespace:
21
- _whitespace_re = re.compile(r'\s+')
22
-
23
- # List of (regular expression, replacement) pairs for abbreviations:
24
- _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
25
- ('mrs', 'misess'),
26
- ('mr', 'mister'),
27
- ('dr', 'doctor'),
28
- ('st', 'saint'),
29
- ('co', 'company'),
30
- ('jr', 'junior'),
31
- ('maj', 'major'),
32
- ('gen', 'general'),
33
- ('drs', 'doctors'),
34
- ('rev', 'reverend'),
35
- ('lt', 'lieutenant'),
36
- ('hon', 'honorable'),
37
- ('sgt', 'sergeant'),
38
- ('capt', 'captain'),
39
- ('esq', 'esquire'),
40
- ('ltd', 'limited'),
41
- ('col', 'colonel'),
42
- ('ft', 'fort'),
43
- ]]
44
-
45
-
46
- def expand_abbreviations(text):
47
- for regex, replacement in _abbreviations:
48
- text = re.sub(regex, replacement, text)
49
- return text
50
-
51
-
52
- def expand_numbers(text):
53
- return normalize_numbers(text)
54
-
55
-
56
- def lowercase(text):
57
- return text.lower()
58
-
59
-
60
- def collapse_whitespace(text):
61
- return re.sub(_whitespace_re, ' ', text)
62
-
63
-
64
- def convert_to_ascii(text):
65
- return unidecode(text)
66
-
67
-
68
- def basic_cleaners(text):
69
- '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
70
- text = lowercase(text)
71
- text = collapse_whitespace(text)
72
- return text
73
-
74
-
75
- def transliteration_cleaners(text):
76
- '''Pipeline for non-English text that transliterates to ASCII.'''
77
- text = convert_to_ascii(text)
78
- text = lowercase(text)
79
- text = collapse_whitespace(text)
80
- return text
81
-
82
-
83
- def english_cleaners(text):
84
- '''Pipeline for English text, including abbreviation expansion.'''
85
- text = convert_to_ascii(text)
86
- text = lowercase(text)
87
- text = expand_abbreviations(text)
88
- phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
89
- phonemes = collapse_whitespace(phonemes)
90
- return phonemes
91
-
92
-
93
- def english_cleaners2(text):
94
- '''Pipeline for English text, including abbreviation expansion. + punctuation + stress'''
95
- text = convert_to_ascii(text)
96
- text = lowercase(text)
97
- text = expand_abbreviations(text)
98
- phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True)
99
- phonemes = collapse_whitespace(phonemes)
100
- return phonemes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
text/symbols.py DELETED
@@ -1,16 +0,0 @@
1
- """ from https://github.com/keithito/tacotron """
2
-
3
- '''
4
- Defines the set of symbols used in text input to the model.
5
- '''
6
- _pad = '_'
7
- _punctuation = ';:,.!?¡¿—…"«»“” '
8
- _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
9
- _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
10
-
11
-
12
- # Export all symbols:
13
- symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
14
-
15
- # Special symbol ids
16
- SPACE_ID = symbols.index(" ")