NeoPy commited on
Commit
05aac64
·
verified ·
1 Parent(s): f8a7cd6
RVC/modules/attentions.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import math
4
+ import torch
5
+
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ sys.path.append(os.getcwd())
10
+
11
+ from modules.commons import convert_pad_shape
12
+
13
+ class MultiHeadAttention(nn.Module):
14
+ def __init__(self, channels, out_channels, n_heads, p_dropout=0.0, window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
15
+ super().__init__()
16
+ assert channels % n_heads == 0
17
+ self.channels = channels
18
+ self.out_channels = out_channels
19
+ self.n_heads = n_heads
20
+ self.p_dropout = p_dropout
21
+ self.window_size = window_size
22
+ self.heads_share = heads_share
23
+ self.block_length = block_length
24
+ self.proximal_bias = proximal_bias
25
+ self.proximal_init = proximal_init
26
+ self.attn = None
27
+ self.k_channels = channels // n_heads
28
+ self.conv_q = nn.Conv1d(channels, channels, 1)
29
+ self.conv_k = nn.Conv1d(channels, channels, 1)
30
+ self.conv_v = nn.Conv1d(channels, channels, 1)
31
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
32
+ self.drop = nn.Dropout(p_dropout)
33
+
34
+ if window_size is not None:
35
+ n_heads_rel = 1 if heads_share else n_heads
36
+ rel_stddev = self.k_channels**-0.5
37
+
38
+ self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
39
+ self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
40
+
41
+ nn.init.xavier_uniform_(self.conv_q.weight)
42
+ nn.init.xavier_uniform_(self.conv_k.weight)
43
+ nn.init.xavier_uniform_(self.conv_v.weight)
44
+ nn.init.xavier_uniform_(self.conv_o.weight)
45
+
46
+ if proximal_init:
47
+ with torch.no_grad():
48
+ self.conv_k.weight.copy_(self.conv_q.weight)
49
+ self.conv_k.bias.copy_(self.conv_q.bias)
50
+
51
+ def forward(self, x, c, attn_mask=None):
52
+ q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c)
53
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
54
+
55
+ return self.conv_o(x)
56
+
57
+ def attention(self, query, key, value, mask=None):
58
+ b, d, t_s, t_t = (*key.size(), query.size(2))
59
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
60
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
61
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
62
+
63
+ if self.window_size is not None:
64
+ assert (t_s == t_t)
65
+ scores += self._relative_position_to_absolute_position(self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), self._get_relative_embeddings(self.emb_rel_k, t_s)))
66
+
67
+ if self.proximal_bias:
68
+ assert t_s == t_t
69
+ scores += self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
70
+
71
+ if mask is not None:
72
+ scores = scores.masked_fill(mask == 0, -1e4)
73
+ if self.block_length is not None:
74
+ assert (t_s == t_t)
75
+ scores = scores.masked_fill((torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)) == 0, -1e4)
76
+
77
+ p_attn = self.drop(F.softmax(scores, dim=-1))
78
+ output = torch.matmul(p_attn, value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3))
79
+
80
+ if self.window_size is not None: output += self._matmul_with_relative_values(self._absolute_position_to_relative_position(p_attn), self._get_relative_embeddings(self.emb_rel_v, t_s))
81
+ return (output.transpose(2, 3).contiguous().view(b, d, t_t)), p_attn
82
+
83
+ def _matmul_with_relative_values(self, x, y):
84
+ return torch.matmul(x, y.unsqueeze(0))
85
+
86
+ def _matmul_with_relative_keys(self, x, y):
87
+ return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
88
+
89
+ def _get_relative_embeddings(self, relative_embeddings, length):
90
+ pad_length = max(length - (self.window_size + 1), 0)
91
+ slice_start_position = max((self.window_size + 1) - length, 0)
92
+
93
+ return (F.pad(relative_embeddings, convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) if pad_length > 0 else relative_embeddings)[:, slice_start_position:(slice_start_position + 2 * length - 1)]
94
+
95
+ def _relative_position_to_absolute_position(self, x):
96
+ batch, heads, length, _ = x.size()
97
+
98
+ return F.pad(F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])).view([batch, heads, length * 2 * length]), convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])).view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1 :]
99
+
100
+ def _absolute_position_to_relative_position(self, x):
101
+ batch, heads, length, _ = x.size()
102
+
103
+ return F.pad(F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])).view([batch, heads, length**2 + length * (length - 1)]), convert_pad_shape([[0, 0], [0, 0], [length, 0]])).view([batch, heads, length, 2 * length])[:, :, :, 1:]
104
+
105
+ def _attention_bias_proximal(self, length):
106
+ r = torch.arange(length, dtype=torch.float32)
107
+
108
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs((torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)))), 0), 0)
109
+
110
+ class FFN(nn.Module):
111
+ def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0.0, activation=None, causal=False):
112
+ super().__init__()
113
+ self.in_channels = in_channels
114
+ self.out_channels = out_channels
115
+ self.filter_channels = filter_channels
116
+ self.kernel_size = kernel_size
117
+ self.p_dropout = p_dropout
118
+ self.activation = activation
119
+ self.causal = causal
120
+ self.padding = self._causal_padding if causal else self._same_padding
121
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
122
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
123
+ self.drop = nn.Dropout(p_dropout)
124
+
125
+ def forward(self, x, x_mask):
126
+ x = self.conv_1(self.padding(x * x_mask))
127
+
128
+ return self.conv_2(self.padding(self.drop(((x * torch.sigmoid(1.702 * x)) if self.activation == "gelu" else torch.relu(x))) * x_mask)) * x_mask
129
+
130
+ def _causal_padding(self, x):
131
+ if self.kernel_size == 1: return x
132
+
133
+ return F.pad(x, convert_pad_shape([[0, 0], [0, 0], [(self.kernel_size - 1), 0]]))
134
+
135
+ def _same_padding(self, x):
136
+ if self.kernel_size == 1: return x
137
+
138
+ return F.pad(x, convert_pad_shape([[0, 0], [0, 0], [((self.kernel_size - 1) // 2), (self.kernel_size // 2)]]))
RVC/modules/commons.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ def init_weights(m, mean=0.0, std=0.01):
4
+ if m.__class__.__name__.find("Conv") != -1: m.weight.data.normal_(mean, std)
5
+
6
+ def get_padding(kernel_size, dilation=1):
7
+ return int((kernel_size * dilation - dilation) / 2)
8
+
9
+ def convert_pad_shape(pad_shape):
10
+ return [item for sublist in pad_shape[::-1] for item in sublist]
11
+
12
+ def slice_segments(x, ids_str, segment_size = 4, dim = 2):
13
+ if dim == 2: ret = torch.zeros_like(x[:, :segment_size])
14
+ elif dim == 3: ret = torch.zeros_like(x[:, :, :segment_size])
15
+
16
+ for i in range(x.size(0)):
17
+ idx_str = ids_str[i].item()
18
+ idx_end = idx_str + segment_size
19
+
20
+ if dim == 2: ret[i] = x[i, idx_str:idx_end]
21
+ else: ret[i] = x[i, :, idx_str:idx_end]
22
+
23
+ return ret
24
+
25
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
26
+ b, _, t = x.size()
27
+ if x_lengths is None: x_lengths = t
28
+
29
+ ids_str = (torch.rand([b]).to(device=x.device) * (x_lengths - segment_size + 1)).to(dtype=torch.long)
30
+
31
+ return slice_segments(x, ids_str, segment_size, dim=3), ids_str
32
+
33
+ @torch.jit.script
34
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
35
+ n_channels_int = n_channels[0]
36
+ in_act = input_a + input_b
37
+
38
+ return torch.tanh(in_act[:, :n_channels_int, :]) * torch.sigmoid(in_act[:, n_channels_int:, :])
39
+
40
+ def sequence_mask(length, max_length = None):
41
+ if max_length is None: max_length = length.max()
42
+ return torch.arange(max_length, dtype=length.dtype, device=length.device).unsqueeze(0) < length.unsqueeze(1)
43
+
44
+ def clip_grad_value(parameters, clip_value, norm_type=2):
45
+ if isinstance(parameters, torch.Tensor): parameters = [parameters]
46
+ norm_type = float(norm_type)
47
+
48
+ if clip_value is not None: clip_value = float(clip_value)
49
+ total_norm = 0
50
+
51
+ for p in list(filter(lambda p: p.grad is not None, parameters)):
52
+ total_norm += (p.grad.data.norm(norm_type)).item() ** norm_type
53
+
54
+ if clip_value is not None: p.grad.data.clamp_(min=-clip_value, max=clip_value)
55
+
56
+ return total_norm ** (1.0 / norm_type)
RVC/modules/config.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+
5
+ sys.path.append(os.getcwd())
6
+
7
+ from modules import opencl
8
+
9
+ def singleton(cls):
10
+ instances = {}
11
+
12
+ def get_instance(*args, **kwargs):
13
+ if cls not in instances: instances[cls] = cls(*args, **kwargs)
14
+ return instances[cls]
15
+
16
+ return get_instance
17
+
18
+ @singleton
19
+ class Config:
20
+ def __init__(self, cpu_mode=False, is_half=False):
21
+ self.device = "cuda:0" if torch.cuda.is_available() else ("ocl:0" if opencl.is_available() else "cpu")
22
+ self.is_half = is_half
23
+ self.gpu_mem = None
24
+ self.cpu_mode = cpu_mode
25
+ if cpu_mode: self.device = "cpu"
26
+
27
+ def device_config(self):
28
+ if not self.cpu_mode:
29
+ if self.device.startswith("cuda"): self.set_cuda_config()
30
+ elif opencl.is_available(): self.device = "ocl:0"
31
+ elif self.has_mps(): self.device = "mps"
32
+ else: self.device = "cpu"
33
+
34
+ if self.gpu_mem is not None and self.gpu_mem <= 4: return 1, 5, 30, 32
35
+ return (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41)
36
+
37
+ def set_cuda_config(self):
38
+ i_device = int(self.device.split(":")[-1])
39
+ self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (1024**3)
40
+
41
+ def has_mps(self):
42
+ return torch.backends.mps.is_available()
RVC/modules/cut.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ class Slicer:
4
+ def __init__(self, sr, threshold = -40.0, min_length = 5000, min_interval = 300, hop_size = 20, max_sil_kept = 5000):
5
+ min_interval = sr * min_interval / 1000
6
+ self.threshold = 10 ** (threshold / 20.0)
7
+ self.hop_size = round(sr * hop_size / 1000)
8
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
9
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
10
+ self.min_interval = round(min_interval / self.hop_size)
11
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
12
+
13
+ def _apply_slice(self, waveform, begin, end):
14
+ start_idx = begin * self.hop_size
15
+
16
+ return waveform[:, start_idx:min(waveform.shape[1], end * self.hop_size)] if len(waveform.shape) > 1 else waveform[start_idx:min(waveform.shape[0], end * self.hop_size)]
17
+
18
+ def slice(self, waveform):
19
+ samples = waveform.mean(axis=0) if len(waveform.shape) > 1 else waveform
20
+ if samples.shape[0] <= self.min_length: return [waveform]
21
+ rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
22
+ sil_tags = []
23
+ silence_start, clip_start = None, 0
24
+
25
+ for i, rms in enumerate(rms_list):
26
+ if rms < self.threshold:
27
+ if silence_start is None: silence_start = i
28
+ continue
29
+
30
+ if silence_start is None: continue
31
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
32
+ need_slice_middle = (i - silence_start >= self.min_interval and i - clip_start >= self.min_length)
33
+ if not is_leading_silence and not need_slice_middle:
34
+ silence_start = None
35
+ continue
36
+
37
+ if i - silence_start <= self.max_sil_kept:
38
+ pos = rms_list[silence_start : i + 1].argmin() + silence_start
39
+ sil_tags.append((0, pos) if silence_start == 0 else (pos, pos))
40
+ clip_start = pos
41
+ elif i - silence_start <= self.max_sil_kept * 2:
42
+ pos = rms_list[i - self.max_sil_kept : silence_start + self.max_sil_kept + 1].argmin()
43
+ pos += i - self.max_sil_kept
44
+ pos_r = (rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept)
45
+ if silence_start == 0:
46
+ sil_tags.append((0, pos_r))
47
+ clip_start = pos_r
48
+ else:
49
+ sil_tags.append((min((rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start), pos), max(pos_r, pos)))
50
+ clip_start = max(pos_r, pos)
51
+ else:
52
+ pos_r = (rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept)
53
+ sil_tags.append((0, pos_r) if silence_start == 0 else ((rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start), pos_r))
54
+ clip_start = pos_r
55
+
56
+ silence_start = None
57
+ total_frames = rms_list.shape[0]
58
+ if (silence_start is not None and total_frames - silence_start >= self.min_interval): sil_tags.append((rms_list[silence_start : min(total_frames, silence_start + self.max_sil_kept) + 1].argmin() + silence_start, total_frames + 1))
59
+
60
+ if not sil_tags: return [waveform]
61
+ else:
62
+ chunks = []
63
+ if sil_tags[0][0] > 0: chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
64
+
65
+ for i in range(len(sil_tags) - 1):
66
+ chunks.append(self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]))
67
+
68
+ if sil_tags[-1][1] < total_frames: chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames))
69
+ return chunks
70
+
71
+ class Slicer2(Slicer):
72
+ def slice2(self, waveform):
73
+ samples = waveform.mean(axis=0) if len(waveform.shape) > 1 else waveform
74
+
75
+ if samples.shape[0] <= self.min_length: return [(waveform, 0, samples.shape[0])]
76
+ rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
77
+
78
+ sil_tags = []
79
+ silence_start, clip_start = None, 0
80
+
81
+ for i, rms in enumerate(rms_list):
82
+ if rms < self.threshold:
83
+ if silence_start is None: silence_start = i
84
+ continue
85
+
86
+ if silence_start is None: continue
87
+
88
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
89
+ need_slice_middle = (i - silence_start >= self.min_interval and i - clip_start >= self.min_length)
90
+
91
+ if not is_leading_silence and not need_slice_middle:
92
+ silence_start = None
93
+ continue
94
+
95
+ if i - silence_start <= self.max_sil_kept:
96
+ pos = rms_list[silence_start : i + 1].argmin() + silence_start
97
+ sil_tags.append((0, pos) if silence_start == 0 else (pos, pos))
98
+ clip_start = pos
99
+ elif i - silence_start <= self.max_sil_kept * 2:
100
+ pos = rms_list[i - self.max_sil_kept : silence_start + self.max_sil_kept + 1].argmin()
101
+ pos += i - self.max_sil_kept
102
+
103
+ pos_r = (rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept)
104
+
105
+ if silence_start == 0:
106
+ sil_tags.append((0, pos_r))
107
+ clip_start = pos_r
108
+ else:
109
+ sil_tags.append((min((rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start), pos), max(pos_r, pos)))
110
+ clip_start = max(pos_r, pos)
111
+ else:
112
+ pos_r = (rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept)
113
+ sil_tags.append((0, pos_r) if silence_start == 0 else ((rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start), pos_r))
114
+ clip_start = pos_r
115
+
116
+ silence_start = None
117
+
118
+ total_frames = rms_list.shape[0]
119
+ if (silence_start is not None and total_frames - silence_start >= self.min_interval): sil_tags.append((rms_list[silence_start : min(total_frames, silence_start + self.max_sil_kept) + 1].argmin() + silence_start, total_frames + 1))
120
+
121
+ if not sil_tags: return [(waveform, 0, samples.shape[-1])]
122
+ else:
123
+ chunks = []
124
+ if sil_tags[0][0] > 0: chunks.append((self._apply_slice(waveform, 0, sil_tags[0][0]), 0, sil_tags[0][0] * self.hop_size))
125
+
126
+ for i in range(len(sil_tags) - 1):
127
+ chunks.append((self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]), sil_tags[i][1] * self.hop_size, sil_tags[i + 1][0] * self.hop_size))
128
+
129
+ if sil_tags[-1][1] < total_frames: chunks.append((self._apply_slice(waveform, sil_tags[-1][1], total_frames), sil_tags[-1][1] * self.hop_size, samples.shape[-1]))
130
+ return chunks
131
+
132
+ def get_rms(y, frame_length=2048, hop_length=512, pad_mode="constant"):
133
+ y = np.pad(y, (int(frame_length // 2), int(frame_length // 2)), mode=pad_mode)
134
+ axis = -1
135
+
136
+ x_shape_trimmed = list(y.shape)
137
+ x_shape_trimmed[axis] -= frame_length - 1
138
+ xw = np.moveaxis(np.lib.stride_tricks.as_strided(y, shape=tuple(x_shape_trimmed) + tuple([frame_length]), strides=y.strides + tuple([y.strides[axis]])), -1, axis - 1 if axis < 0 else axis + 1)
139
+
140
+ slices = [slice(None)] * xw.ndim
141
+ slices[axis] = slice(0, None, hop_length)
142
+
143
+ return np.sqrt(np.mean(np.abs(xw[tuple(slices)]) ** 2, axis=-2, keepdims=True))
144
+
145
+ def cut(audio, sr, db_thresh=-60, min_interval=250):
146
+ slicer = Slicer2(sr=sr, threshold=db_thresh, min_interval=min_interval)
147
+ return slicer.slice2(audio)
148
+
149
+ def restore(segments, total_len, dtype=np.float32):
150
+ out = []
151
+ last_end = 0
152
+
153
+ for start, end, processed_seg in segments:
154
+ if start > last_end: out.append(np.zeros(start - last_end, dtype=dtype))
155
+
156
+ out.append(processed_seg)
157
+ last_end = end
158
+
159
+ if last_end < total_len: out.append(np.zeros(total_len - last_end, dtype=dtype))
160
+ return np.concatenate(out, axis=-1)
RVC/modules/download.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import sys
4
+ import shutil
5
+
6
+ sys.path.append(os.getcwd())
7
+
8
+ from modules.utils import HF_download_file
9
+ from modules import gdown, meganz, mediafire, pixeldrain
10
+
11
+ def move_files_from_directory(src_dir, dest_models, model_name):
12
+ for root, _, files in os.walk(src_dir):
13
+ for file in files:
14
+ file_path = os.path.join(root, file)
15
+ if file.endswith(".index"):
16
+ filepath = os.path.join(dest_models, file.replace(' ', '_').replace('(', '').replace(')', '').replace('[', '').replace(']', '').replace(",", "").replace('"', "").replace("'", "").replace("|", "").strip())
17
+
18
+ shutil.move(file_path, filepath)
19
+ elif file.endswith(".pth") and not file.startswith("D_") and not file.startswith("G_"):
20
+ pth_path = os.path.join(dest_models, model_name + ".pth")
21
+
22
+ shutil.move(file_path, pth_path)
23
+
24
+ def save_drop_model(dropbox):
25
+ model_folders = "rvc_models"
26
+ save_model_temp = "save_model_temp"
27
+
28
+ if not os.path.exists(model_folders): os.makedirs(model_folders, exist_ok=True)
29
+ if not os.path.exists(save_model_temp): os.makedirs(save_model_temp, exist_ok=True)
30
+
31
+ shutil.move(dropbox, save_model_temp)
32
+
33
+ try:
34
+ print("[INFO] Start uploading...")
35
+
36
+ file_name = os.path.basename(dropbox)
37
+ model_folders = os.path.join(model_folders, file_name.replace(".zip", "").replace(".pth", "").replace(".index", ""))
38
+
39
+ if file_name.endswith(".zip"):
40
+ shutil.unpack_archive(os.path.join(save_model_temp, file_name), save_model_temp)
41
+ move_files_from_directory(save_model_temp, model_folders, file_name.replace(".zip", ""))
42
+ elif file_name.endswith(".pth"):
43
+ output_file = os.path.join(model_folders, file_name)
44
+ shutil.move(os.path.join(save_model_temp, file_name), output_file)
45
+ elif file_name.endswith(".index"):
46
+ def extract_name_model(filename):
47
+ match = re.search(r"([A-Za-z]+)(?=_v|\.|$)", filename)
48
+ return match.group(1) if match else None
49
+
50
+ model_logs = os.path.join(model_folders, extract_name_model(file_name))
51
+ if not os.path.exists(model_logs): os.makedirs(model_logs, exist_ok=True)
52
+ shutil.move(os.path.join(save_model_temp, file_name), model_logs)
53
+ else:
54
+ print("[WARNING] Format not supported. Supported formats ('.zip', '.pth', '.index')")
55
+ return
56
+
57
+ print("[INFO] Completed upload.")
58
+ except Exception as e:
59
+ print(f"[ERROR] An error occurred during unpack: {e}")
60
+ finally:
61
+ shutil.rmtree(save_model_temp, ignore_errors=True)
62
+
63
+ def download_model(url=None, model=None):
64
+ if not url:
65
+ print("[WARNING] Please provide a valid url.")
66
+ return
67
+
68
+ if not model:
69
+ print("[WARNING] Please provide a valid model name.")
70
+ return
71
+
72
+ model = model.replace(".pth", "").replace(".index", "").replace(".zip", "").replace(" ", "_").replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace(",", "").replace('"', "").replace("'", "").replace("|", "").strip()
73
+ url = url.replace("/blob/", "/resolve/").replace("?download=true", "").strip()
74
+
75
+ download_dir = "download_model"
76
+ model_folders = "rvc_models"
77
+
78
+ if not os.path.exists(download_dir): os.makedirs(download_dir, exist_ok=True)
79
+ if not os.path.exists(model_folders): os.makedirs(model_folders, exist_ok=True)
80
+
81
+ model_folders = os.path.join(model_folders, model)
82
+ os.makedirs(model_folders, exist_ok=True)
83
+
84
+ try:
85
+ print("[INFO] Start downloading...")
86
+
87
+ if url.endswith(".pth"): HF_download_file(url, os.path.join(model_folders, f"{model}.pth"))
88
+ elif url.endswith(".index"): HF_download_file(url, os.path.join(model_folders, f"{model}.index"))
89
+ elif url.endswith(".zip"):
90
+ output_path = HF_download_file(url, os.path.join(download_dir, model + ".zip"))
91
+ shutil.unpack_archive(output_path, download_dir)
92
+
93
+ move_files_from_directory(download_dir, model_folders, model)
94
+ else:
95
+ if "drive.google.com" in url or "drive.usercontent.google.com" in url:
96
+ file_id = None
97
+
98
+ if "/file/d/" in url: file_id = url.split("/d/")[1].split("/")[0]
99
+ elif "open?id=" in url: file_id = url.split("open?id=")[1].split("/")[0]
100
+ elif "/download?id=" in url: file_id = url.split("/download?id=")[1].split("&")[0]
101
+
102
+ if file_id:
103
+ file = gdown.gdown_download(id=file_id, output=download_dir)
104
+ if file.endswith(".zip"): shutil.unpack_archive(file, download_dir)
105
+
106
+ move_files_from_directory(download_dir, model_folders, model)
107
+ elif "mega.nz" in url:
108
+ meganz.mega_download_url(url, download_dir)
109
+
110
+ file_download = next((f for f in os.listdir(download_dir)), None)
111
+ if file_download.endswith(".zip"): shutil.unpack_archive(os.path.join(download_dir, file_download), download_dir)
112
+
113
+ move_files_from_directory(download_dir, model_folders, model)
114
+ elif "mediafire.com" in url:
115
+ file = mediafire.Mediafire_Download(url, download_dir)
116
+ if file.endswith(".zip"): shutil.unpack_archive(file, download_dir)
117
+
118
+ move_files_from_directory(download_dir, model_folders, model)
119
+ elif "pixeldrain.com" in url:
120
+ file = pixeldrain.pixeldrain(url, download_dir)
121
+ if file.endswith(".zip"): shutil.unpack_archive(file, download_dir)
122
+
123
+ move_files_from_directory(download_dir, model_folders, model)
124
+ else:
125
+ print("[WARNING] The url path is not supported.")
126
+ return
127
+
128
+ print("[INFO] Model download complete.")
129
+ except Exception as e:
130
+ print(f"[INFO] An error has occurred: {e}")
131
+ finally:
132
+ shutil.rmtree(download_dir, ignore_errors=True)
RVC/modules/encoders.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import math
4
+ import torch
5
+
6
+ sys.path.append(os.getcwd())
7
+
8
+ from modules.modules import WaveNet
9
+ from modules.commons import sequence_mask
10
+ from modules.normalization import LayerNorm
11
+ from modules.attentions import MultiHeadAttention, FFN
12
+
13
+ class Encoder(torch.nn.Module):
14
+ def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.0, window_size=10, **kwargs):
15
+ super().__init__()
16
+ self.hidden_channels = hidden_channels
17
+ self.filter_channels = filter_channels
18
+ self.n_heads = n_heads
19
+ self.n_layers = n_layers
20
+ self.kernel_size = kernel_size
21
+ self.p_dropout = p_dropout
22
+ self.window_size = window_size
23
+ self.drop = torch.nn.Dropout(p_dropout)
24
+ self.attn_layers = torch.nn.ModuleList()
25
+ self.norm_layers_1 = torch.nn.ModuleList()
26
+ self.ffn_layers = torch.nn.ModuleList()
27
+ self.norm_layers_2 = torch.nn.ModuleList()
28
+
29
+ for _ in range(self.n_layers):
30
+ self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
31
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
32
+
33
+ self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
34
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
35
+
36
+ def forward(self, x, x_mask):
37
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
38
+ x = x * x_mask
39
+
40
+ for i in range(self.n_layers):
41
+ x = self.norm_layers_1[i](x + self.drop(self.attn_layers[i](x, x, attn_mask)))
42
+ x = self.norm_layers_2[i](x + self.drop(self.ffn_layers[i](x, x_mask)))
43
+
44
+ return x * x_mask
45
+
46
+ class TextEncoder(torch.nn.Module):
47
+ def __init__(self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, embedding_dim, f0=True, energy=False, onnx=False):
48
+ super(TextEncoder, self).__init__()
49
+ self.out_channels = out_channels
50
+ self.hidden_channels = hidden_channels
51
+ self.filter_channels = filter_channels
52
+ self.n_heads = n_heads
53
+ self.n_layers = n_layers
54
+ self.kernel_size = kernel_size
55
+ self.p_dropout = float(p_dropout)
56
+ self.lrelu = torch.nn.LeakyReLU(0.1, inplace=True)
57
+ self.emb_phone = torch.nn.Linear(embedding_dim, hidden_channels)
58
+ self.emb_pitch = torch.nn.Embedding(256, hidden_channels) if f0 else None
59
+ self.emb_energy = torch.nn.Linear(1, hidden_channels) if energy else None
60
+ self.encoder = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, float(p_dropout), onnx=onnx)
61
+ self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)
62
+
63
+ def forward(self, phone, pitch, lengths, energy):
64
+ x = self.emb_phone(phone)
65
+
66
+ if pitch is not None: x += self.emb_pitch(pitch)
67
+ if energy is not None: x += self.emb_energy(energy.unsqueeze(-1))
68
+
69
+ x = torch.transpose(self.lrelu(x * math.sqrt(self.hidden_channels)), 1, -1)
70
+ x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(x.dtype)
71
+ m, logs = torch.split((self.proj(self.encoder(x * x_mask, x_mask)) * x_mask), self.out_channels, dim=1)
72
+
73
+ return m, logs, x_mask
74
+
75
+ class PosteriorEncoder(torch.nn.Module):
76
+ def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0):
77
+ super(PosteriorEncoder, self).__init__()
78
+ self.in_channels = in_channels
79
+ self.out_channels = out_channels
80
+ self.hidden_channels = hidden_channels
81
+ self.kernel_size = kernel_size
82
+ self.dilation_rate = dilation_rate
83
+ self.n_layers = n_layers
84
+ self.gin_channels = gin_channels
85
+ self.pre = torch.nn.Conv1d(in_channels, hidden_channels, 1)
86
+ self.enc = WaveNet(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
87
+ self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)
88
+
89
+ def forward(self, x, x_lengths, g = None):
90
+ x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
91
+ m, logs = torch.split((self.proj(self.enc((self.pre(x) * x_mask), x_mask, g=g)) * x_mask), self.out_channels, dim=1)
92
+
93
+ return ((m + torch.randn_like(m) * torch.exp(logs)) * x_mask), m, logs, x_mask
94
+
95
+ def remove_weight_norm(self):
96
+ self.enc.remove_weight_norm()
RVC/modules/fairseq.py ADDED
@@ -0,0 +1,1396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import sys
3
+ import math
4
+ import uuid
5
+ import torch
6
+ import types
7
+ import contextlib
8
+
9
+ import numpy as np
10
+ import torch.nn.functional as F
11
+
12
+ from torch import nn
13
+ from omegaconf import DictConfig, open_dict
14
+
15
+ class Dictionary:
16
+ def __init__(self, *args, **kwargs):
17
+ pass
18
+
19
+ fairseq = types.ModuleType("fairseq")
20
+ fairseq_data = types.ModuleType("fairseq.data")
21
+ fairseq_data_dictionary = types.ModuleType("fairseq.data.dictionary")
22
+ fairseq_data_dictionary.Dictionary = Dictionary
23
+ fairseq.data = fairseq_data
24
+ fairseq_data.dictionary = fairseq_data_dictionary
25
+ sys.modules["fairseq"] = fairseq
26
+ sys.modules["fairseq.data"] = fairseq_data
27
+ sys.modules["fairseq.data.dictionary"] = fairseq_data_dictionary
28
+
29
+ def load_model(filename):
30
+ state = torch.load(filename, map_location="cpu")
31
+ model = HubertModel(HubertConfig(**state['cfg']['model']))
32
+ model.load_state_dict(state['model'], strict=False)
33
+ return model
34
+
35
+ def softmax(x, dim, onnx_trace = False):
36
+ return F.softmax(x.float(), dim=dim) if onnx_trace else F.softmax(x, dim=dim, dtype=torch.float32)
37
+
38
+ def log_softmax(x, dim, onnx_trace = False):
39
+ return F.log_softmax(x.float(), dim=dim) if onnx_trace else F.log_softmax(x, dim=dim, dtype=torch.float32)
40
+
41
+ def eval_str_dict(x, type=dict):
42
+ if x is None: return None
43
+ if isinstance(x, str): x = eval(x)
44
+ return x
45
+
46
+ def with_incremental_state(cls):
47
+ cls.__bases__ = (FairseqIncrementalState,) + tuple(b for b in cls.__bases__ if b != FairseqIncrementalState)
48
+ return cls
49
+
50
+ def quant_noise(module, p, block_size):
51
+ if p <= 0: return module
52
+ assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d))
53
+ is_conv = module.weight.ndim == 4
54
+ if not is_conv: assert (module.weight.size(1) % block_size == 0)
55
+ else:
56
+ if module.kernel_size == (1, 1): assert (module.in_channels % block_size == 0)
57
+ else:
58
+ k = module.kernel_size[0] * module.kernel_size[1]
59
+ assert k % block_size == 0
60
+
61
+ def _forward_pre_hook(mod, input):
62
+ if mod.training:
63
+ if not is_conv:
64
+ weight = mod.weight
65
+ in_features = weight.size(1)
66
+ out_features = weight.size(0)
67
+ mask = torch.zeros(in_features // block_size * out_features, device=weight.device)
68
+ mask.bernoulli_(p)
69
+ mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
70
+ else:
71
+ weight = mod.weight
72
+ in_channels = mod.in_channels
73
+ out_channels = mod.out_channels
74
+
75
+ if mod.kernel_size == (1, 1):
76
+ mask = torch.zeros(int(in_channels // block_size * out_channels), device=weight.device)
77
+ mask.bernoulli_(p)
78
+ mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
79
+ else:
80
+ mask = torch.zeros(weight.size(0), weight.size(1), device=weight.device)
81
+ mask.bernoulli_(p)
82
+ mask = (mask.unsqueeze(2).unsqueeze(3).repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1]))
83
+
84
+ mask = mask.to(torch.bool)
85
+ s = 1 / (1 - p)
86
+ mod.weight.data = s * weight.masked_fill(mask, 0)
87
+
88
+ module.register_forward_pre_hook(_forward_pre_hook)
89
+ return module
90
+
91
+ class FairseqDropout(nn.Module):
92
+ def __init__(self, p, module_name=None):
93
+ super().__init__()
94
+ self.p = p
95
+ self.module_name = module_name
96
+ self.apply_during_inference = False
97
+
98
+ def forward(self, x, inplace = False):
99
+ return F.dropout(x, p=self.p, training=True, inplace=inplace) if self.p > 0 and (self.training or self.apply_during_inference) else x
100
+
101
+ def make_generation_fast_(self, name, retain_dropout = False, retain_dropout_modules = None, **kwargs):
102
+ if retain_dropout:
103
+ if (retain_dropout_modules is None or self.module_name in retain_dropout_modules): self.apply_during_inference = True
104
+
105
+ class FairseqIncrementalState(object):
106
+ def __init__(self, *args, **kwargs):
107
+ super().__init__(*args, **kwargs)
108
+ self.init_incremental_state()
109
+
110
+ def init_incremental_state(self):
111
+ self._incremental_state_id = str(uuid.uuid4())
112
+
113
+ def _get_full_incremental_state_key(self, key):
114
+ return "{}.{}".format(self._incremental_state_id, key)
115
+
116
+ def get_incremental_state(self, incremental_state, key):
117
+ full_key = self._get_full_incremental_state_key(key)
118
+ if incremental_state is None or full_key not in incremental_state: return None
119
+ return incremental_state[full_key]
120
+
121
+ def set_incremental_state(self, incremental_state, key, value):
122
+ if incremental_state is not None: incremental_state[self._get_full_incremental_state_key(key)] = value
123
+ return incremental_state
124
+
125
+ class FairseqDecoder(nn.Module):
126
+ def __init__(self, dictionary):
127
+ super().__init__()
128
+ self.dictionary = dictionary
129
+ self.onnx_trace = False
130
+ self.adaptive_softmax = None
131
+
132
+ def forward(self, prev_output_tokens, encoder_out=None, **kwargs):
133
+ x, extra = self.extract_features(prev_output_tokens, encoder_out=encoder_out, **kwargs)
134
+ return self.output_layer(x), extra
135
+
136
+ def extract_features(self, prev_output_tokens, encoder_out=None, **kwargs):
137
+ pass
138
+
139
+ def output_layer(self, features, **kwargs):
140
+ pass
141
+
142
+ def get_normalized_probs(self, net_output, log_probs, sample = None):
143
+ return self.get_normalized_probs_scriptable(net_output, log_probs, sample)
144
+
145
+ def get_normalized_probs_scriptable(self, net_output, log_probs, sample = None):
146
+ if hasattr(self, "adaptive_softmax") and self.adaptive_softmax is not None:
147
+ if sample is not None:
148
+ assert "target" in sample
149
+ target = sample["target"]
150
+ else: target = None
151
+ out = self.adaptive_softmax.get_log_prob(net_output[0], target=target)
152
+ return out.exp_() if not log_probs else out
153
+
154
+ logits = net_output[0]
155
+ return log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace) if log_probs else softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
156
+
157
+ def max_positions(self):
158
+ return 1e6
159
+
160
+ def upgrade_state_dict_named(self, state_dict, name):
161
+ return state_dict
162
+
163
+ def prepare_for_onnx_export_(self):
164
+ self.onnx_trace = True
165
+
166
+ @with_incremental_state
167
+ class FairseqIncrementalDecoder(FairseqDecoder):
168
+ def __init__(self, dictionary):
169
+ super().__init__(dictionary)
170
+
171
+ def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs):
172
+ pass
173
+
174
+ def extract_features(self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs):
175
+ pass
176
+
177
+ def reorder_incremental_state(self, incremental_state, new_order):
178
+ pass
179
+
180
+ def reorder_incremental_state_scripting(self, incremental_state, new_order):
181
+ for module in self.modules():
182
+ if hasattr(module, "reorder_incremental_state"):
183
+ result = module.reorder_incremental_state(incremental_state, new_order)
184
+ if result is not None: incremental_state = result
185
+
186
+ def set_beam_size(self, beam_size):
187
+ if getattr(self, "_beam_size", -1) != beam_size:
188
+ seen = set()
189
+
190
+ def apply_set_beam_size(module):
191
+ if (module != self and hasattr(module, "set_beam_size") and module not in seen):
192
+ seen.add(module)
193
+ module.set_beam_size(beam_size)
194
+
195
+ self.apply(apply_set_beam_size)
196
+ self._beam_size = beam_size
197
+
198
+ class MultiheadAttention(FairseqIncrementalDecoder):
199
+ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, self_attention=False, encoder_decoder_attention=False, dictionary=None, q_noise=0.0, qn_block_size=8, xformers_att_config=None, xformers_blocksparse_layout=None, xformers_blocksparse_blocksize=16):
200
+ super().__init__(dictionary)
201
+ xformers_att_config = eval_str_dict(xformers_att_config)
202
+ self.use_xformers = xformers_att_config is not None
203
+ if self.use_xformers: raise ImportError
204
+ self.embed_dim = embed_dim
205
+ self.kdim = kdim if kdim is not None else embed_dim
206
+ self.vdim = vdim if vdim is not None else embed_dim
207
+ self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
208
+ self.num_heads = num_heads
209
+ self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)
210
+ self.head_dim = embed_dim // num_heads
211
+ assert (self.head_dim * num_heads == self.embed_dim)
212
+ self.scaling = self.head_dim**-0.5
213
+ self.self_attention = self_attention
214
+ self.encoder_decoder_attention = encoder_decoder_attention
215
+ assert not self.self_attention or self.qkv_same_dim
216
+ self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size)
217
+ self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size)
218
+ self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size)
219
+ self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size)
220
+ if add_bias_kv: self.bias_k, self.bias_v = nn.Parameter(torch.Tensor(1, 1, embed_dim)), nn.Parameter(torch.Tensor(1, 1, embed_dim))
221
+ else: self.bias_k = self.bias_v = None
222
+ self.add_zero_attn = add_zero_attn
223
+ self.beam_size = 1
224
+ self.reset_parameters()
225
+ self.onnx_trace = False
226
+ self.skip_embed_dim_check = False
227
+ self.init_incremental_state()
228
+
229
+ def prepare_for_onnx_export_(self):
230
+ self.onnx_trace = True
231
+
232
+ def reset_parameters(self):
233
+ if self.qkv_same_dim:
234
+ nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
235
+ nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
236
+ nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
237
+ else:
238
+ nn.init.xavier_uniform_(self.k_proj.weight)
239
+ nn.init.xavier_uniform_(self.v_proj.weight)
240
+ nn.init.xavier_uniform_(self.q_proj.weight)
241
+
242
+ nn.init.xavier_uniform_(self.out_proj.weight)
243
+ if self.out_proj.bias is not None: nn.init.constant_(self.out_proj.bias, 0.0)
244
+ if self.bias_k is not None: nn.init.xavier_normal_(self.bias_k)
245
+ if self.bias_v is not None: nn.init.xavier_normal_(self.bias_v)
246
+
247
+ def _get_reserve_head_index(self, num_heads_to_keep: int):
248
+ k_proj_heads_norm, q_proj_heads_norm, v_proj_heads_norm = [], [], []
249
+ for i in range(self.num_heads):
250
+ start_idx = i * self.head_dim
251
+ end_idx = (i + 1) * self.head_dim
252
+ k_proj_heads_norm.append(torch.sum(torch.abs(self.k_proj.weight[start_idx:end_idx])).tolist() + torch.sum(torch.abs(self.k_proj.bias[start_idx:end_idx])).tolist())
253
+ q_proj_heads_norm.append(torch.sum(torch.abs(self.q_proj.weight[start_idx:end_idx])).tolist() + torch.sum(torch.abs(self.q_proj.bias[start_idx:end_idx])).tolist())
254
+ v_proj_heads_norm.append(torch.sum(torch.abs(self.v_proj.weight[start_idx:end_idx])).tolist() + torch.sum(torch.abs(self.v_proj.bias[start_idx:end_idx])).tolist())
255
+
256
+ heads_norm = []
257
+ for i in range(self.num_heads):
258
+ heads_norm.append(k_proj_heads_norm[i] + q_proj_heads_norm[i] + v_proj_heads_norm[i])
259
+
260
+ sorted_head_index = sorted(range(self.num_heads), key=lambda k: heads_norm[k], reverse=True)
261
+ reserve_head_index = []
262
+ for i in range(num_heads_to_keep):
263
+ reserve_head_index.append((sorted_head_index[i] * self.head_dim, (sorted_head_index[i] + 1) * self.head_dim))
264
+ return reserve_head_index
265
+
266
+ def _adaptive_prune_heads(self, reserve_head_index):
267
+ new_q_weight, new_q_bias, new_k_weight, new_k_bias, new_v_weight, new_v_bias, new_out_proj_weight = [], [], [], [], [], [], []
268
+ for ele in reserve_head_index:
269
+ start_idx, end_idx = ele
270
+ new_q_weight.append(self.q_proj.weight[start_idx:end_idx])
271
+ new_q_bias.append(self.q_proj.bias[start_idx:end_idx])
272
+ new_k_weight.append(self.k_proj.weight[start_idx:end_idx])
273
+ new_k_bias.append(self.k_proj.bias[start_idx:end_idx])
274
+ new_v_weight.append(self.v_proj.weight[start_idx:end_idx])
275
+ new_v_bias.append(self.v_proj.bias[start_idx:end_idx])
276
+ new_out_proj_weight.append(self.out_proj.weight[:, start_idx:end_idx])
277
+ new_q_weight = torch.cat(new_q_weight).detach()
278
+ new_k_weight = torch.cat(new_k_weight).detach()
279
+ new_v_weight = torch.cat(new_v_weight).detach()
280
+ new_out_proj_weight = torch.cat(new_out_proj_weight, dim=-1).detach()
281
+ new_q_weight.requires_grad = True
282
+ new_k_weight.requires_grad = True
283
+ new_v_weight.requires_grad = True
284
+ new_out_proj_weight.requires_grad = True
285
+ new_q_bias = torch.cat(new_q_bias).detach()
286
+ new_q_bias.requires_grad = True
287
+ new_k_bias = torch.cat(new_k_bias).detach()
288
+ new_k_bias.requires_grad = True
289
+ new_v_bias = torch.cat(new_v_bias).detach()
290
+ new_v_bias.requires_grad = True
291
+ self.q_proj.weight = nn.Parameter(new_q_weight)
292
+ self.q_proj.bias = nn.Parameter(new_q_bias)
293
+ self.k_proj.weight = nn.Parameter(new_k_weight)
294
+ self.k_proj.bias = nn.Parameter(new_k_bias)
295
+ self.v_proj.weight = nn.Parameter(new_v_weight)
296
+ self.v_proj.bias = nn.Parameter(new_v_bias)
297
+ self.out_proj.weight = nn.Parameter(new_out_proj_weight)
298
+ self.num_heads = len(reserve_head_index)
299
+ self.embed_dim = self.head_dim * self.num_heads
300
+ self.q_proj.out_features = self.embed_dim
301
+ self.k_proj.out_features = self.embed_dim
302
+ self.v_proj.out_features = self.embed_dim
303
+
304
+ def _set_skip_embed_dim_check(self):
305
+ self.skip_embed_dim_check = True
306
+
307
+ def _pad_masks(self, key_padding_mask, attn_mask):
308
+ if attn_mask is not None:
309
+ shape = attn_mask.size()[:-1] + torch.Size([1])
310
+ attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(shape)], dim=-1)
311
+
312
+ if key_padding_mask is not None:
313
+ shape = key_padding_mask.size()[:-1] + torch.Size([1])
314
+ key_padding_mask = torch.cat([key_padding_mask, key_padding_mask.new_zeros(shape)], dim=-1)
315
+
316
+ return key_padding_mask, attn_mask
317
+
318
+ def _add_bias(self, k, v, key_padding_mask, attn_mask, bsz):
319
+ assert self.bias_k is not None or self.bias_v is not None
320
+ key_padding_mask, attn_mask = self._pad_masks(key_padding_mask=key_padding_mask, attn_mask=attn_mask)
321
+ return torch.cat([k, self.bias_k.repeat(1, bsz, 1)]), torch.cat([v, self.bias_v.repeat(1, bsz, 1)]), key_padding_mask, attn_mask
322
+
323
+ def _append_zero_attn(self, k, v, key_padding_mask, attn_mask):
324
+ zero_attn_shape = k.size()[:-2] + torch.Size([1]) + k.size()[-1:]
325
+ key_padding_mask, attn_mask = self._pad_masks(key_padding_mask=key_padding_mask, attn_mask=attn_mask)
326
+ return torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=-2), torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=-2), key_padding_mask, attn_mask
327
+
328
+ def forward(self, query, key, value, key_padding_mask = None, incremental_state = None, need_weights = True, static_kv = False, attn_mask = None, before_softmax = False, need_head_weights = False):
329
+ if need_head_weights: need_weights = True
330
+ is_tpu = query.device.type == "xla"
331
+ tgt_len, bsz, embed_dim = query.size()
332
+ src_len = tgt_len
333
+ if not self.skip_embed_dim_check: assert (embed_dim == self.embed_dim)
334
+ assert list(query.size()) == [tgt_len, bsz, embed_dim]
335
+ if key is not None:
336
+ src_len, key_bsz, _ = key.size()
337
+ if not torch.jit.is_scripting():
338
+ assert value is not None
339
+ assert src_len, key_bsz == value.shape[:2]
340
+
341
+ if (not self.onnx_trace and not is_tpu and incremental_state is None and not static_kv and not torch.jit.is_scripting() and not self.skip_embed_dim_check):
342
+ assert key is not None and value is not None
343
+ return F.multi_head_attention_forward(query, key, value, self.embed_dim, self.num_heads, torch.empty([0]), torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), self.bias_k, self.bias_v, self.add_zero_attn, self.dropout_module.p, self.out_proj.weight, self.out_proj.bias, self.training or self.dropout_module.apply_during_inference, key_padding_mask.bool() if key_padding_mask is not None else None, need_weights, attn_mask, use_separate_proj_weight=True, q_proj_weight=self.q_proj.weight, k_proj_weight=self.k_proj.weight, v_proj_weight=self.v_proj.weight)
344
+
345
+ if incremental_state is not None:
346
+ saved_state = self._get_input_buffer(incremental_state)
347
+ if saved_state is not None and "prev_key" in saved_state:
348
+ if static_kv:
349
+ assert self.encoder_decoder_attention and not self.self_attention
350
+ key = value = None
351
+ else: saved_state = None
352
+
353
+ if self.self_attention:
354
+ q = self.q_proj(query)
355
+ k = self.k_proj(query)
356
+ v = self.v_proj(query)
357
+ elif self.encoder_decoder_attention:
358
+ q = self.q_proj(query)
359
+ if key is None:
360
+ assert value is None
361
+ k = v = None
362
+ else:
363
+ if self.beam_size > 1 and bsz == key.size(1):
364
+ key = key.view(key.size(0), -1, self.beam_size, key.size(2))[:, :, 0, :]
365
+ if key_padding_mask is not None: key_padding_mask = key_padding_mask.view(-1, self.beam_size, key_padding_mask.size(1))[:, 0, :]
366
+ k = self.k_proj(key)
367
+ v = self.v_proj(key)
368
+ else:
369
+ assert key is not None and value is not None
370
+ q = self.q_proj(query)
371
+ k = self.k_proj(key)
372
+ v = self.v_proj(value)
373
+
374
+ q *= self.scaling
375
+ if self.bias_k is not None:
376
+ assert self.bias_v is not None
377
+ k, v, attn_mask, key_padding_mask = self._add_bias(k, v, attn_mask, key_padding_mask, bsz)
378
+
379
+ q = (q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1))
380
+ kv_bsz = bsz
381
+ if k is not None:
382
+ kv_bsz = k.size(1)
383
+ k = (k.contiguous().view(-1, kv_bsz * self.num_heads, self.head_dim).transpose(0, 1))
384
+
385
+ if v is not None: v = (v.contiguous().view(-1, kv_bsz * self.num_heads, self.head_dim).transpose(0, 1))
386
+ if saved_state is not None:
387
+ if "prev_key" in saved_state:
388
+ _prev_key = saved_state["prev_key"]
389
+ assert _prev_key is not None
390
+
391
+ kv_bsz = _prev_key.size(0)
392
+ prev_key = _prev_key.view(kv_bsz * self.num_heads, -1, self.head_dim)
393
+
394
+ if static_kv: k = prev_key
395
+ else:
396
+ assert k is not None
397
+ k = torch.cat([prev_key, k], dim=1)
398
+ src_len = k.size(1)
399
+
400
+ if "prev_value" in saved_state:
401
+ _prev_value = saved_state["prev_value"]
402
+ assert _prev_value is not None or kv_bsz == _prev_value.size(0)
403
+ prev_value = _prev_value.view(kv_bsz * self.num_heads, -1, self.head_dim)
404
+ if static_kv: v = prev_value
405
+ else:
406
+ assert v is not None
407
+ v = torch.cat([prev_value, v], dim=1)
408
+
409
+ prev_key_padding_mask = None
410
+ if "prev_key_padding_mask" in saved_state: prev_key_padding_mask = saved_state["prev_key_padding_mask"]
411
+ assert k is not None and v is not None
412
+ key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(key_padding_mask=key_padding_mask, prev_key_padding_mask=prev_key_padding_mask, batch_size=kv_bsz, src_len=k.size(1), static_kv=static_kv)
413
+ saved_state["prev_key"] = k.view(kv_bsz, self.num_heads, -1, self.head_dim)
414
+ saved_state["prev_value"] = v.view(kv_bsz, self.num_heads, -1, self.head_dim)
415
+ saved_state["prev_key_padding_mask"] = key_padding_mask
416
+ assert incremental_state is not None
417
+ incremental_state = self._set_input_buffer(incremental_state, saved_state)
418
+
419
+ assert k is not None
420
+ assert k.size(1) == src_len
421
+
422
+ if key_padding_mask is not None and key_padding_mask.dim() == 0: key_padding_mask = None
423
+
424
+ if key_padding_mask is not None:
425
+ assert key_padding_mask.size(0) == kv_bsz
426
+ assert key_padding_mask.size(1) == src_len
427
+
428
+ if self.add_zero_attn:
429
+ assert v is not None
430
+ src_len += 1
431
+ k, v, key_padding_mask, attn_mask = self._append_zero_attn(k=k, v=v, key_padding_mask=key_padding_mask, attn_mask=attn_mask)
432
+
433
+ if self.encoder_decoder_attention and bsz != kv_bsz:
434
+ attn_weights = torch.einsum("bxhtd,bhsd->bxhts", q.view((kv_bsz, -1, self.num_heads) + q.size()[1:]), k.view((kv_bsz, self.num_heads) + k.size()[1:]))
435
+ attn_weights = attn_weights.reshape((-1,) + attn_weights.size()[-2:])
436
+ else: attn_weights = torch.bmm(q, k.transpose(1, 2))
437
+
438
+ assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
439
+
440
+ if attn_mask is not None:
441
+ attn_mask = attn_mask.unsqueeze(0)
442
+ if self.onnx_trace: attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
443
+ attn_weights += attn_mask
444
+
445
+ if key_padding_mask is not None:
446
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
447
+ attn_weights = attn_weights.view(kv_bsz, -1, self.num_heads, tgt_len, src_len).masked_fill(key_padding_mask.unsqueeze(1).unsqueeze(2).unsqueeze(3).to(torch.bool), float("-inf")) if not is_tpu else attn_weights.transpose(0, 2).masked_fill(key_padding_mask, float("-inf")).transpose(0, 2)
448
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
449
+
450
+ if before_softmax: return attn_weights, v
451
+ attn_weights_float = softmax(attn_weights, dim=-1, onnx_trace=self.onnx_trace)
452
+ attn_weights = attn_weights_float.type_as(attn_weights)
453
+ attn_probs = self.dropout_module(attn_weights)
454
+ assert v is not None
455
+ attn = None
456
+
457
+ if self.encoder_decoder_attention and bsz != kv_bsz:
458
+ attn = torch.einsum("bxhts,bhsd->bxhtd", attn_probs.view((kv_bsz, -1, self.num_heads) + attn_probs.size()[1:]), v.view((kv_bsz, self.num_heads) + v.size()[1:]))
459
+ attn = attn.reshape((-1,) + attn.size()[-2:])
460
+ else: attn = torch.bmm(attn_probs, v)
461
+ assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
462
+
463
+ attn = attn.contiguous().view(tgt_len, bsz, self.embed_dim) if self.onnx_trace and attn.size(1) == 1 else attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim)
464
+ attn = self.out_proj(attn)
465
+ attn_weights = None
466
+
467
+ if need_weights:
468
+ attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0)
469
+ if not need_head_weights: attn_weights = attn_weights.mean(dim=0)
470
+
471
+ return attn, attn_weights
472
+
473
+ @staticmethod
474
+ def _append_prev_key_padding_mask(key_padding_mask, prev_key_padding_mask, batch_size, src_len, static_kv):
475
+ if prev_key_padding_mask is not None and static_kv: new_key_padding_mask = prev_key_padding_mask
476
+ elif prev_key_padding_mask is not None and key_padding_mask is not None: new_key_padding_mask = torch.cat([prev_key_padding_mask.float(), key_padding_mask.float()], dim=1)
477
+ elif prev_key_padding_mask is not None:
478
+ if src_len > prev_key_padding_mask.size(1):
479
+ filler = torch.zeros((batch_size, src_len - prev_key_padding_mask.size(1)), device=prev_key_padding_mask.device)
480
+ new_key_padding_mask = torch.cat([prev_key_padding_mask.float(), filler.float()], dim=1)
481
+ else: new_key_padding_mask = prev_key_padding_mask.float()
482
+ elif key_padding_mask is not None:
483
+ if src_len > key_padding_mask.size(1):
484
+ filler = torch.zeros((batch_size, src_len - key_padding_mask.size(1)), device=key_padding_mask.device)
485
+ new_key_padding_mask = torch.cat([filler.float(), key_padding_mask.float()], dim=1)
486
+ else: new_key_padding_mask = key_padding_mask.float()
487
+ else: new_key_padding_mask = prev_key_padding_mask
488
+ return new_key_padding_mask
489
+
490
+ @torch.jit.export
491
+ def reorder_incremental_state(self, incremental_state, new_order):
492
+ input_buffer = self._get_input_buffer(incremental_state)
493
+ if input_buffer is not None:
494
+ for k in input_buffer.keys():
495
+ input_buffer_k = input_buffer[k]
496
+ if input_buffer_k is not None:
497
+ if self.encoder_decoder_attention:
498
+ if input_buffer_k.size(0) * self.beam_size == new_order.size(0): return incremental_state
499
+ elif self.beam_size > 1: input_buffer[k] = input_buffer_k.index_select(0, new_order.reshape(-1, self.beam_size)[:, 0] // self.beam_size)
500
+ else: input_buffer[k] = input_buffer_k.index_select(0, new_order)
501
+ else: input_buffer[k] = input_buffer_k.index_select(0, new_order)
502
+ incremental_state = self._set_input_buffer(incremental_state, input_buffer)
503
+ return incremental_state
504
+
505
+ def set_beam_size(self, beam_size):
506
+ self.beam_size = beam_size
507
+
508
+ def _get_input_buffer(self, incremental_state):
509
+ result = self.get_incremental_state(incremental_state, "attn_state")
510
+ return result if result is not None else {}
511
+
512
+ def _set_input_buffer(self, incremental_state, buffer):
513
+ return self.set_incremental_state(incremental_state, "attn_state", buffer)
514
+
515
+ def upgrade_state_dict_named(self, state_dict, name):
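+ # split a legacy fused in_proj_weight / in_proj_bias checkpoint into separate q_proj / k_proj / v_proj entries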
516
+ prefix = name + "." if name != "" else ""
517
+ items_to_add, keys_to_remove = {}, []
518
+ for k in state_dict.keys():
519
+ if k.endswith(prefix + "in_proj_weight"):
520
+ dim = int(state_dict[k].shape[0] / 3)
521
+ items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim]
522
+ items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim]
523
+ items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :]
524
+ keys_to_remove.append(k)
525
+ k_bias = prefix + "in_proj_bias"
526
+ if k_bias in state_dict.keys():
527
+ dim = int(state_dict[k].shape[0] / 3)
528
+ items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim]
529
+ items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][dim : 2 * dim]
530
+ items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :]
531
+ keys_to_remove.append(prefix + "in_proj_bias")
532
+
533
+ for k in keys_to_remove:
534
+ del state_dict[k]
535
+
536
+ for key, value in items_to_add.items():
537
+ state_dict[key] = value
538
+
539
+ def init_bert_params(module):
540
+ def normal_(data):
541
+ data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device))
542
+
543
+ if isinstance(module, nn.Linear):
544
+ normal_(module.weight.data)
545
+ if module.bias is not None: module.bias.data.zero_()
546
+ if isinstance(module, nn.Embedding):
547
+ normal_(module.weight.data)
548
+ if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_()
549
+ if isinstance(module, MultiheadAttention):
550
+ normal_(module.q_proj.weight.data)
551
+ normal_(module.k_proj.weight.data)
552
+ normal_(module.v_proj.weight.data)
553
+
554
+ def make_conv_pos(e, k, g):
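+ # convolutional positional encoding: a grouped, weight-normalised Conv1d followed by SamePad trimming and GELU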
555
+ pos_conv = nn.Conv1d(e, e, kernel_size=k, padding=k // 2, groups=g)
556
+ dropout = 0
557
+ nn.init.normal_(pos_conv.weight, mean=0, std=math.sqrt((4 * (1.0 - dropout)) / (k * e)))
558
+ nn.init.constant_(pos_conv.bias, 0)
559
+ return nn.Sequential(nn.utils.parametrizations.weight_norm(pos_conv, name="weight", dim=2), SamePad(k), nn.GELU())
560
+
561
+ def is_xla_tensor(tensor):
562
+ return torch.is_tensor(tensor) and tensor.device.type == "xla"
563
+
564
+ def index_put(tensor, indices, value):
565
+ if is_xla_tensor(tensor):
566
+ for _ in range(indices.dim(), tensor.dim()):
567
+ indices = indices.unsqueeze(-1)
568
+
569
+ if indices.size(-1) < tensor.size(-1): indices = indices.expand_as(tensor)
570
+ tensor = torch.mul(tensor, ~indices) + torch.mul(value, indices)
571
+ else: tensor[indices] = value
572
+
573
+ return tensor
574
+
575
+ def pad_to_multiple(x, multiple, dim=-1, value=0):
576
+ if x is None: return None, 0
577
+ tsz = x.size(dim)
578
+ m = tsz / multiple
579
+ remainder = math.ceil(m) * multiple - tsz
580
+ if m.is_integer(): return x, 0
581
+ return F.pad(x, (*((0,) * (-1 - dim) * 2), 0, remainder), value=value), remainder
582
+
583
+ def compute_mask_indices(shape, padding_mask, mask_prob, mask_length, mask_type = "static", mask_other = 0.0, min_masks = 0, no_overlap = False, min_space = 0, require_same_masks = True, mask_dropout = 0.0, add_masks = False, seed = None, epoch = None, indices = None, idc_select_ver = 1, num_mask_ver = 2):
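+ # sample per-utterance span masks for masked-prediction training; returns a boolean numpy array of shape (batch, time)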
584
+ bsz, all_sz = shape
585
+ mask = np.full((bsz, all_sz), False)
586
+ if num_mask_ver == 1: all_num_mask = max(min_masks, int(mask_prob * all_sz / float(mask_length) + np.random.rand()))
587
+ mask_idcs = []
588
+
589
+ for i in range(bsz):
590
+ seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6) if seed is not None and epoch is not None and indices is not None else None
591
+ rng = np.random.default_rng(seed_i)
592
+
593
+ if padding_mask is not None:
594
+ sz = all_sz - padding_mask[i].long().sum().item()
595
+ assert sz >= 0, sz
596
+ else: sz = all_sz
597
+
598
+ if num_mask_ver == 1: num_mask = max(min_masks, int(mask_prob * sz / float(mask_length) + np.random.rand())) if padding_mask is not None else all_num_mask
599
+ elif num_mask_ver == 2: num_mask = max(min_masks, int(mask_prob * sz / float(mask_length) + rng.random()))
600
+ else: raise ValueError
601
+
602
+ if mask_type == "static": lengths = np.full(num_mask, mask_length)
603
+ elif mask_type == "uniform": lengths = rng.integers(int(mask_other), mask_length * 2 + 1, size=num_mask)
604
+ elif mask_type == "normal": lengths = [max(1, int(round(x))) for x in rng.normal(mask_length, mask_other, size=num_mask)]
605
+ elif mask_type == "poisson": lengths = [int(round(x)) for x in rng.poisson(mask_length, size=num_mask)]
606
+ else: raise Exception
607
+
608
+ if sum(lengths) == 0:
609
+ if mask_type == "static": raise ValueError
610
+ else: lengths = [min(mask_length, sz - 1)]
611
+
612
+ if no_overlap:
613
+ mask_idc = []
614
+
615
+ def arrange(s, e, length, keep_length):
616
+ span_start = rng.integers(s, e - length)
617
+ mask_idc.extend(span_start + i for i in range(length))
618
+ new_parts = []
619
+ if span_start - s - min_space >= keep_length: new_parts.append((s, span_start - min_space + 1))
620
+ if e - span_start - length - min_space > keep_length: new_parts.append((span_start + length + min_space, e))
621
+ return new_parts
622
+
623
+ parts = [(0, sz)]
624
+ min_length = min(lengths)
625
+ for length in sorted(lengths, reverse=True):
626
+ lens = np.fromiter((e - s if e - s >= length + min_space else 0 for s, e in parts), np.int32)
627
+ l_sum = np.sum(lens)
628
+ if l_sum == 0: break
629
+ s, e = parts.pop(rng.choice(len(parts), p=lens / np.sum(lens)))
630
+ parts.extend(arrange(s, e, length, min_length))
631
+ mask_idc = np.asarray(mask_idc)
632
+ else:
633
+ if idc_select_ver == 1:
634
+ min_len = min(lengths)
635
+ if sz - min_len <= num_mask: min_len = sz - num_mask - 1
636
+ mask_idc = rng.choice(sz - min_len, num_mask, replace=False)
637
+ elif idc_select_ver == 2: mask_idc = rng.choice(sz, num_mask, replace=False)
638
+ else: raise ValueError
639
+
640
+ mask_idc = np.asarray([mask_idc[j] + offset for j in range(len(mask_idc)) for offset in range(lengths[j])])
641
+
642
+ mask_idc = np.unique(mask_idc[mask_idc < sz])
643
+ if len(mask_idc) >= sz: raise ValueError
644
+ mask_idcs.append(mask_idc)
645
+
646
+ target_len = None
647
+ if require_same_masks: target_len = max([len(m) for m in mask_idcs]) if add_masks else min([len(m) for m in mask_idcs])
648
+
649
+ for i, mask_idc in enumerate(mask_idcs):
650
+ if target_len is not None and len(mask_idc) > target_len: mask_idc = rng.choice(mask_idc, target_len, replace=False)
651
+ mask[i, mask_idc] = True
652
+
653
+ if target_len is not None and len(mask_idc) < target_len:
654
+ to_mask = rng.choice(np.flatnonzero(~mask[i]), target_len - len(mask_idc), replace=False)
655
+ mask[i, to_mask] = True
656
+
657
+ if mask_dropout > 0:
658
+ masked = np.flatnonzero(mask[i])
659
+ mask[i, rng.choice(masked, np.rint(len(masked) * mask_dropout).astype(int), replace=False)] = False
660
+
661
+ return mask
662
+
663
+ def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):  # export is accepted for call-site compatibility and ignored
664
+ return nn.LayerNorm(normalized_shape, eps, elementwise_affine)
665
+
666
+ def prune_state_dict(state_dict, model_cfg):
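+ # drop encoder/decoder layers that are not listed in encoder_layers_to_keep / decoder_layers_to_keep and renumber the surviving layers in the state-dict keys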
667
+ arch = None
668
+ if model_cfg is not None: arch = (model_cfg._name if isinstance(model_cfg, DictConfig) else getattr(model_cfg, "arch", None))
669
+ if not model_cfg or arch is None or arch == "ptt_transformer": return state_dict
670
+ encoder_layers_to_keep = getattr(model_cfg, "encoder_layers_to_keep", None)
671
+ decoder_layers_to_keep = getattr(model_cfg, "decoder_layers_to_keep", None)
672
+ if not encoder_layers_to_keep and not decoder_layers_to_keep: return state_dict
673
+
674
+ def create_pruning_pass(layers_to_keep, layer_name):
675
+ keep_layers = sorted(int(layer_string) for layer_string in layers_to_keep.split(","))
676
+ mapping_dict = {}
677
+ for i in range(len(keep_layers)):
678
+ mapping_dict[str(keep_layers[i])] = str(i)
679
+
680
+ return {"substitution_regex": re.compile(r"^{layer}.*\.layers\.(\d+)".format(layer=layer_name)), "mapping_dict": mapping_dict}
681
+
682
+ pruning_passes, new_state_dict = [], {}
683
+ if encoder_layers_to_keep: pruning_passes.append(create_pruning_pass(encoder_layers_to_keep, "encoder"))
684
+ if decoder_layers_to_keep: pruning_passes.append(create_pruning_pass(decoder_layers_to_keep, "decoder"))
685
+
686
+ for layer_name in state_dict.keys():
687
+ match = re.search(r"\.layers\.(\d+)\.", layer_name)
688
+ if not match:
689
+ new_state_dict[layer_name] = state_dict[layer_name]
690
+ continue
691
+
692
+ original_layer_number = match.group(1)
693
+ for pruning_pass in pruning_passes:
694
+ if original_layer_number in pruning_pass["mapping_dict"] and pruning_pass["substitution_regex"].search(layer_name):
695
+ substitution_match = pruning_pass["substitution_regex"].search(layer_name)
696
+ new_state_dict[(layer_name[: substitution_match.start(1)] + pruning_pass["mapping_dict"][original_layer_number] + layer_name[substitution_match.end(1) :])] = state_dict[layer_name]
697
+
698
+ with open_dict(model_cfg) if isinstance(model_cfg, DictConfig) else contextlib.ExitStack():
699
+ if hasattr(model_cfg, "encoder_layers_to_keep"): model_cfg.encoder_layers_to_keep = None
700
+ if hasattr(model_cfg, "decoder_layers_to_keep"): model_cfg.decoder_layers_to_keep = None
701
+
702
+ return new_state_dict
703
+
704
+ def relu_squared(x):
705
+ return F.relu(x).pow(2)
706
+
707
+ def get_activation_fn(activation):
708
+ def gelu(x):
709
+ return nn.functional.gelu(x.float()).type_as(x)
710
+
711
+ def gelu_accurate(x):
712
+ if not hasattr(gelu_accurate, "_a"):
713
+ gelu_accurate._a = math.sqrt(2 / math.pi)
714
+ return (0.5 * x * (1 + torch.tanh(gelu_accurate._a * (x + 0.044715 * torch.pow(x, 3)))))
715
+
716
+ if activation == "relu": return F.relu
717
+ elif activation == "relu_squared": return relu_squared
718
+ elif activation == "gelu": return gelu
719
+ elif activation == "gelu_fast": return gelu_accurate
720
+ elif activation == "gelu_accurate": return gelu_accurate
721
+ elif activation == "tanh": return torch.tanh
722
+ elif activation == "linear": return lambda x: x
723
+ elif activation == "swish": return nn.SiLU
724
+ else: raise RuntimeError
725
+
726
+ class SamePad(nn.Module):
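+ # removes the extra frame produced by a Conv1d with padding=k//2 when the kernel size is even (or all look-ahead frames in the causal case)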
727
+ def __init__(self, kernel_size, causal=False):
728
+ super().__init__()
729
+ if causal: self.remove = kernel_size - 1
730
+ else: self.remove = 1 if kernel_size % 2 == 0 else 0
731
+
732
+ def forward(self, x):
733
+ if self.remove > 0: x = x[:, :, : -self.remove]
734
+ return x
735
+
736
+ class TransformerSentenceEncoderLayer(nn.Module):
737
+ def __init__(self, embedding_dim = 768, ffn_embedding_dim = 3072, num_attention_heads = 8, dropout = 0.1, attention_dropout = 0.1, activation_dropout = 0.1, activation_fn = "relu", layer_norm_first = False):
738
+ super().__init__()
739
+ self.embedding_dim = embedding_dim
740
+ self.dropout = dropout
741
+ self.activation_dropout = activation_dropout
742
+ self.activation_fn = get_activation_fn(activation_fn)
743
+ self.self_attn = MultiheadAttention(self.embedding_dim, num_attention_heads, dropout=attention_dropout, self_attention=True)
744
+ self.dropout1 = nn.Dropout(dropout)
745
+ self.dropout2 = nn.Dropout(self.activation_dropout)
746
+ self.dropout3 = nn.Dropout(dropout)
747
+ self.layer_norm_first = layer_norm_first
748
+ self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
749
+ self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
750
+ self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
751
+ self.final_layer_norm = LayerNorm(self.embedding_dim)
752
+
753
+ def forward(self, x, self_attn_mask=None, self_attn_padding_mask=None, need_weights=False, att_args=None):
754
+ residual = x
755
+ if self.layer_norm_first:
756
+ x = self.self_attn_layer_norm(x)
757
+ x, attn = self.self_attn(query=x, key=x, value=x, key_padding_mask=self_attn_padding_mask, attn_mask=self_attn_mask, need_weights=False)
758
+ x = residual + self.dropout1(x)
759
+ residual = x
760
+ x = self.fc2(self.dropout2(self.activation_fn(self.fc1(self.final_layer_norm(x)))))
761
+ layer_result = x
762
+ x = residual + self.dropout3(x)
763
+ else:
764
+ x, attn = self.self_attn(query=x, key=x, value=x, key_padding_mask=self_attn_padding_mask, need_weights=False)
765
+ x = self.self_attn_layer_norm(residual + self.dropout1(x))
766
+ residual = x
767
+ x = self.fc2(self.dropout2(self.activation_fn(self.fc1(x))))
768
+ layer_result = x
769
+ x = self.final_layer_norm(residual + self.dropout3(x))
770
+
771
+ return x, (attn, layer_result)
772
+
773
+ class AdapterFast(nn.Module):
774
+ def __init__(self, adapter_num, input_dim, hidden_dim, act_fn):
775
+ super().__init__()
776
+ self.adapter_num = adapter_num
777
+ self.input_dim = input_dim
778
+ self.hidden_dim = hidden_dim
779
+ self.W_a = nn.Parameter(torch.empty(adapter_num, hidden_dim, input_dim))
780
+ self.W_b = nn.Parameter(torch.empty(adapter_num, input_dim, hidden_dim))
781
+ self.b_a = nn.Parameter(torch.empty(adapter_num, hidden_dim))
782
+ self.b_b = nn.Parameter(torch.empty(adapter_num, input_dim))
783
+ self.ln_W = nn.Parameter(torch.empty(adapter_num, input_dim))
784
+ self.ln_b = nn.Parameter(torch.empty(adapter_num, input_dim))
785
+ self.act_fn = nn.Identity()
786
+ if act_fn == "relu": self.act_fn = nn.ReLU()
787
+ elif act_fn == "gelu": self.act_fn = nn.GELU()
788
+ elif act_fn == "selu": self.act_fn = nn.SELU()
789
+ else: raise ValueError
790
+ self.input_dim = input_dim
791
+ self.reset_parameters()
792
+
793
+ def reset_parameters(self):
794
+ for ii in range(self.adapter_num):
795
+ nn.init.kaiming_uniform_(self.W_a[ii], a=math.sqrt(5))
796
+ nn.init.kaiming_uniform_(self.W_b[ii], a=math.sqrt(5))
797
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.W_a[ii])
798
+ bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
799
+ nn.init.uniform_(self.b_a[ii], -bound, bound)
800
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.W_b[ii])
801
+ bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
802
+ nn.init.uniform_(self.b_b[ii], -bound, bound)
803
+
804
+ nn.init.ones_(self.ln_W)
805
+ nn.init.zeros_(self.ln_b)
806
+
807
+ def forward(self, x, adapter_id):
808
+ ii = adapter_id
809
+ return F.linear(self.act_fn(F.linear(F.layer_norm(x, (self.input_dim, ), self.ln_W[ii], self.ln_b[ii]), self.W_a[ii], self.b_a[ii])), self.W_b[ii], self.b_b[ii])
810
+
811
+ def extra_repr(self):
812
+ return ('adapter={}, input_dim={}, hidden_dim={}'.format(self.adapter_num, self.input_dim, self.hidden_dim))
813
+
814
+ class FeedForwardModule(nn.Module):
815
+ def __init__(self, input_feat, hidden_units, dropout1, dropout2, activation_fn="swish", bias=True):
816
+ super(FeedForwardModule, self).__init__()
817
+ self.layer_norm = LayerNorm(input_feat)
818
+ self.w_1 = nn.Linear(input_feat, hidden_units, bias=bias)
819
+ self.w_2 = nn.Linear(hidden_units, input_feat, bias=bias)
820
+ self.dropout1 = nn.Dropout(dropout1)
821
+ self.dropout2 = nn.Dropout(dropout2)
822
+ self.activation = get_activation_fn(activation_fn)(hidden_units)
823
+
824
+ def forward(self, x):
825
+ return self.dropout2(self.w_2(self.dropout1(self.activation(self.w_1(self.layer_norm(x))))))
826
+
827
+ class ConvolutionModule(nn.Module):
828
+ def __init__(self, embed_dim, channels, depthwise_kernel_size, dropout, activation_fn="swish", bias=False, export=False):
829
+ super(ConvolutionModule, self).__init__()
830
+ assert (depthwise_kernel_size - 1) % 2 == 0
831
+ self.layer_norm = LayerNorm(embed_dim, export=export)
832
+ self.pointwise_conv1 = nn.Conv1d(embed_dim, 2 * channels, kernel_size=1, stride=1, padding=0, bias=bias)
833
+ self.glu = nn.GLU(dim=1)
834
+ self.depthwise_conv = nn.Conv1d(channels, channels, depthwise_kernel_size, stride=1, padding=(depthwise_kernel_size - 1) // 2, groups=channels, bias=bias)
835
+ self.batch_norm = nn.BatchNorm1d(channels)
836
+ self.activation = get_activation_fn(activation_fn)(channels)
837
+ self.pointwise_conv2 = nn.Conv1d(channels, embed_dim, kernel_size=1, stride=1, padding=0, bias=bias)
838
+ self.dropout = nn.Dropout(dropout)
839
+
840
+ def forward(self, x):
841
+ return self.dropout(self.pointwise_conv2(self.activation(self.batch_norm(self.depthwise_conv(self.glu(self.pointwise_conv1(self.layer_norm(x).transpose(1, 2)))))))).transpose(1, 2)
842
+
843
+ def rotate_half(x):
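+ # rotary embedding helper: split the last dimension in half and rotate, i.e. (x1, x2) -> (-x2, x1)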
844
+ x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
845
+ return torch.cat((-x2, x1), dim=x1.ndim - 1)
846
+
847
+ def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0):
848
+ cos, sin = (cos[offset : q.shape[0] + offset, ...], sin[offset : q.shape[0] + offset, ...])
849
+ return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
850
+
851
+ class RotaryPositionalEmbedding(nn.Module):
852
+ def __init__(self, dim, base=10000, precision=torch.half):
853
+ super().__init__()
854
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
855
+ self.register_buffer("inv_freq", inv_freq)
856
+ self.seq_len_cached = 0
857
+ self.cos_cached = torch.empty(self.seq_len_cached, 1, 1, dim)
858
+ self.sin_cached = torch.empty(self.seq_len_cached, 1, 1, dim)
859
+ self.precision = precision
860
+
861
+ def forward(self, x, seq_len = 0):
862
+ if seq_len > self.seq_len_cached:
863
+ self.seq_len_cached = seq_len
864
+ freqs = torch.einsum("i,j->ij", torch.arange(seq_len, device=x.device).type_as(self.inv_freq), self.inv_freq)
865
+ emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
866
+ self.cos_cached = emb.cos().view(emb.size(0), 1, 1, emb.size(1))
867
+ self.sin_cached = emb.sin().view(emb.size(0), 1, 1, emb.size(1))
868
+ return self.cos_cached, self.sin_cached
869
+
870
+ class ESPNETMultiHeadedAttention(nn.Module):
871
+ def __init__(self, n_feat, n_head, dropout):
872
+ super(ESPNETMultiHeadedAttention, self).__init__()
873
+ assert n_feat % n_head == 0
874
+ self.d_k = n_feat // n_head
875
+ self.h = n_head
876
+ self.linear_q = nn.Linear(n_feat, n_feat)
877
+ self.linear_k = nn.Linear(n_feat, n_feat)
878
+ self.linear_v = nn.Linear(n_feat, n_feat)
879
+ self.linear_out = nn.Linear(n_feat, n_feat)
880
+ self.attn = None
881
+ self.dropout = nn.Dropout(p=dropout)
882
+
883
+ def forward_qkv(self, query, key, value, **kwargs):
884
+ n_batch = query.size(0)
885
+ return self.linear_q(query).view(n_batch, -1, self.h, self.d_k).transpose(1, 2), self.linear_k(key).view(n_batch, -1, self.h, self.d_k).transpose(1, 2), self.linear_v(value).view(n_batch, -1, self.h, self.d_k).transpose(1, 2)
886
+
887
+ def forward_attention(self, value, scores, mask):
888
+ n_batch = value.size(0)
889
+ if mask is not None:
890
+ scores = scores.masked_fill(mask.unsqueeze(1).unsqueeze(2).to(bool), float("-inf"))
891
+ self.attn = torch.softmax(scores, dim=-1)
892
+ else: self.attn = torch.softmax(scores, dim=-1)
893
+
894
+ return self.linear_out((torch.matmul(self.dropout(self.attn), value).transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)))
895
+
896
+ def forward(self, query, key, value, key_padding_mask=None, **kwargs):
897
+ q, k, v = self.forward_qkv(query.transpose(0, 1), key.transpose(0, 1), value.transpose(0, 1))
898
+ return self.forward_attention(v, torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k), key_padding_mask).transpose(0, 1), None
899
+
900
+ class RelPositionMultiHeadedAttention(ESPNETMultiHeadedAttention):
901
+ def __init__(self, n_feat, n_head, dropout, zero_triu=False):
902
+ super().__init__(n_feat, n_head, dropout)
903
+ self.zero_triu = zero_triu
904
+ self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
905
+ self.pos_bias_u = nn.Parameter(torch.zeros(self.h, self.d_k))
906
+ self.pos_bias_v = nn.Parameter(torch.zeros(self.h, self.d_k))
907
+ nn.init.xavier_uniform_(self.pos_bias_u)
908
+ nn.init.xavier_uniform_(self.pos_bias_v)
909
+
910
+ def rel_shift(self, x):
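+ # Transformer-XL style relative shift: pad, reshape and slice so that column j of the score matrix lines up with relative distance (i - j)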
911
+ x = torch.cat([torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype), x], dim=-1).view(*x.size()[:2], x.size(3) + 1, x.size(2))[:, :, 1:].view_as(x)[:, :, :, : x.size(-1) // 2 + 1]
912
+ if self.zero_triu: x = x * torch.tril(torch.ones((x.size(2), x.size(3)), device=x.device), x.size(3) - x.size(2))[None, None, :, :]
913
+ return x
914
+
915
+ def forward(self, query, key, value, pos_emb, key_padding_mask=None, **kwargs):
916
+ pos_emb = pos_emb.transpose(0, 1)
917
+ q, k, v = self.forward_qkv(query.transpose(0, 1), key.transpose(0, 1), value.transpose(0, 1))
918
+ q = q.transpose(1, 2)
919
+
920
+ return self.forward_attention(v, (torch.matmul((q + self.pos_bias_u).transpose(1, 2), k.transpose(-2, -1)) + self.rel_shift(torch.matmul((q + self.pos_bias_v).transpose(1, 2), self.linear_pos(pos_emb).view(pos_emb.size(0), -1, self.h, self.d_k).transpose(1, 2).transpose(-2, -1)))) / math.sqrt(self.d_k), key_padding_mask).transpose(0, 1), None
921
+
922
+ class RotaryPositionMultiHeadedAttention(ESPNETMultiHeadedAttention):
923
+ def __init__(self, n_feat, n_head, dropout, precision, rotary_emd_base=10000):
924
+ super().__init__(n_feat, n_head, dropout)
925
+ if precision == "fp16": precision = torch.half
926
+ else: precision = torch.float
927
+ self.rotary_ndims = self.d_k
928
+ self.rotary_emb = RotaryPositionalEmbedding(self.rotary_ndims, base=rotary_emd_base, precision=precision)
929
+
930
+ def forward(self, query, key, value, key_padding_mask=None, **kwargs):
931
+ T, B, C = value.size()
932
+ query = query.view(T, B, self.h, self.d_k)
933
+ key = key.view(T, B, self.h, self.d_k)
934
+ value = value.view(T, B, self.h, self.d_k)
935
+ cos, sin = self.rotary_emb(value, seq_len=T)
936
+ query, key = apply_rotary_pos_emb(query, key, cos, sin, offset=0)
937
+ query = query.view(T, B, self.h * self.d_k)
938
+ key = key.view(T, B, self.h * self.d_k)
939
+ value = value.view(T, B, self.h * self.d_k)
940
+ q, k, v = self.forward_qkv(query.transpose(0, 1), key.transpose(0, 1), value.transpose(0, 1))
941
+ return self.forward_attention(v, torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k), key_padding_mask).transpose(0, 1), None
942
+
943
+ class ConformerEncoderLayer(nn.Module):
944
+ def __init__(self, embed_dim, ffn_embed_dim, attention_heads, dropout, use_fp16, depthwise_conv_kernel_size=31, activation_fn="swish", attn_type=None, pos_enc_type="abs"):
945
+ self.pos_enc_type = pos_enc_type
946
+ super(ConformerEncoderLayer, self).__init__()
947
+ self.ffn1 = FeedForwardModule(embed_dim, ffn_embed_dim, dropout, dropout)
948
+ self.self_attn_layer_norm = LayerNorm(embed_dim, export=False)
949
+ self.self_attn_dropout = nn.Dropout(dropout)
950
+ if attn_type == "espnet":
951
+ if self.pos_enc_type == "rel_pos": self.self_attn = RelPositionMultiHeadedAttention(embed_dim, attention_heads, dropout=dropout)
952
+ elif self.pos_enc_type == "rope": self.self_attn = RotaryPositionMultiHeadedAttention(embed_dim, attention_heads, dropout=dropout, precision=use_fp16)
953
+ elif self.pos_enc_type == "abs": self.self_attn = ESPNETMultiHeadedAttention(embed_dim, attention_heads, dropout=dropout)
954
+ else: raise Exception
955
+ else: self.self_attn = MultiheadAttention(embed_dim, attention_heads, dropout=dropout)
956
+ self.conv_module = ConvolutionModule(embed_dim=embed_dim, channels=embed_dim, depthwise_kernel_size=depthwise_conv_kernel_size, dropout=dropout, activation_fn=activation_fn)
957
+ self.ffn2 = FeedForwardModule(embed_dim, ffn_embed_dim, dropout, dropout, activation_fn=activation_fn)
958
+ self.final_layer_norm = LayerNorm(embed_dim, export=False)
959
+
960
+ def forward(self, x, encoder_padding_mask, position_emb = None):
961
+ residual = x
962
+ x = self.ffn1(x) * 0.5 + residual
963
+ residual = x
964
+ x = self.self_attn_layer_norm(x)
965
+ if self.pos_enc_type == "rel_pos": x, attn = self.self_attn(query=x, key=x, value=x, key_padding_mask=encoder_padding_mask, pos_emb=position_emb, need_weights=False)
966
+ else: x, attn = self.self_attn(query=x, key=x, value=x, key_padding_mask=encoder_padding_mask, need_weights=False)
967
+ x = self.self_attn_dropout(x)
968
+ x = x + residual
969
+ residual = x
970
+ x = residual + self.conv_module(x.transpose(0, 1)).transpose(0, 1)
971
+ residual = x
972
+ x = self.ffn2(x)
973
+ layer_result = x
974
+ x = self.final_layer_norm(x * 0.5 + residual)
975
+ return x, (attn, layer_result)
976
+
977
+ class ConformerWav2Vec2EncoderLayer(ConformerEncoderLayer):
978
+ def forward(self, x, self_attn_mask=None, self_attn_padding_mask=None, need_weights=False, att_args=None, position_emb=None):
979
+ return super().forward(x, self_attn_padding_mask, position_emb)
980
+
981
+ class TransformerSentenceEncoderWithAdapterLayer(TransformerSentenceEncoderLayer):
982
+ def __init__(self, embedding_dim = 768, ffn_embedding_dim = 3072, num_attention_heads = 8, dropout = 0.1, attention_dropout = 0.1, activation_dropout = 0.1, activation_fn = "relu", layer_norm_first = False, adapter_num=201, adapter_dim=64, adapter_act_fn="relu"):
983
+ super().__init__(embedding_dim=embedding_dim, ffn_embedding_dim=ffn_embedding_dim, num_attention_heads=num_attention_heads, dropout=dropout, attention_dropout=attention_dropout, activation_dropout=activation_dropout, activation_fn=activation_fn, layer_norm_first=layer_norm_first)
984
+ self.adapter_num = adapter_num
985
+ self.adapter_dim = adapter_dim
986
+ self.adapter_layer = AdapterFast(adapter_num, self.embedding_dim, self.adapter_dim, adapter_act_fn)
987
+
988
+ def forward(self, x, self_attn_mask=None, self_attn_padding_mask=None, need_weights=False, att_args=None, corpus_key=None):
989
+ x, (attn, layer_result) = super().forward(x=x, self_attn_mask=self_attn_mask, self_attn_padding_mask=self_attn_padding_mask, need_weights=need_weights, att_args=att_args)
990
+ assert corpus_key is not None
991
+ assert len(set(corpus_key)) == 1
992
+ return x + self.adapter_layer(x, corpus_key[0]), (attn, layer_result)
993
+
994
+ class TransposeLast(nn.Module):
995
+ def __init__(self, deconstruct_idx=None, tranpose_dim=-2):
996
+ super().__init__()
997
+ self.deconstruct_idx = deconstruct_idx
998
+ self.tranpose_dim = tranpose_dim
999
+
1000
+ def forward(self, x):
1001
+ if self.deconstruct_idx is not None: x = x[self.deconstruct_idx]
1002
+ return x.transpose(self.tranpose_dim, -1)
1003
+
1004
+ class TransformerEncoder(nn.Module):
1005
+ def build_encoder_layer(self, args, **kwargs):
1006
+ if args.layer_type == "transformer": layer = TransformerSentenceEncoderLayer(embedding_dim=self.embedding_dim, ffn_embedding_dim=args.encoder_ffn_embed_dim, num_attention_heads=args.encoder_attention_heads, dropout=self.dropout, attention_dropout=args.attention_dropout, activation_dropout=args.activation_dropout, activation_fn=args.activation_fn, layer_norm_first=args.layer_norm_first)
1007
+ elif args.layer_type == "conformer": layer = ConformerWav2Vec2EncoderLayer(embed_dim=self.embedding_dim, ffn_embed_dim=args.encoder_ffn_embed_dim, attention_heads=args.encoder_attention_heads, dropout=args.dropout, depthwise_conv_kernel_size=args.depthwise_conv_kernel_size, activation_fn="swish", attn_type=args.attn_type, use_fp16=args.fp16, pos_enc_type="abs")
1008
+ elif args.layer_type == "trf_adp":
1009
+ use_adp = False
1010
+ if args.adp_trf_idx == "all": use_adp = True
1011
+ else:
1012
+ if kwargs.get("layer_idx", None) in list(range(*[int(g) for g in args.adp_trf_idx.split(":")])): use_adp = True
1013
+
1014
+ layer = TransformerSentenceEncoderWithAdapterLayer(embedding_dim=self.embedding_dim, ffn_embedding_dim=args.encoder_ffn_embed_dim, num_attention_heads=args.encoder_attention_heads, dropout=self.dropout, attention_dropout=args.attention_dropout, activation_dropout=args.activation_dropout, activation_fn=args.activation_fn, layer_norm_first=args.layer_norm_first, adapter_num=args.adp_num, adapter_dim=args.adp_dim, adapter_act_fn=args.adp_act_fn) if use_adp else TransformerSentenceEncoderLayer(embedding_dim=self.embedding_dim, ffn_embedding_dim=args.encoder_ffn_embed_dim, num_attention_heads=args.encoder_attention_heads, dropout=self.dropout, attention_dropout=args.attention_dropout, activation_dropout=args.activation_dropout, activation_fn=args.activation_fn, layer_norm_first=args.layer_norm_first,)
1015
+
1016
+ return layer
1017
+
1018
+ def __init__(self, args):
1019
+ super().__init__()
1020
+ self.dropout = args.dropout
1021
+ self.embedding_dim = args.encoder_embed_dim
1022
+ self.required_seq_len_multiple = args.required_seq_len_multiple
1023
+ pos_conv_depth = getattr(args, "pos_conv_depth", 1)
1024
+ if pos_conv_depth > 1:
1025
+ num_layers = args.pos_conv_depth
1026
+ k = max(3, args.conv_pos // num_layers)
1027
+
1028
+ def make_conv_block(e, k, g, l):
1029
+ return nn.Sequential(*[nn.Sequential(nn.Conv1d(e, e, kernel_size=k, padding=k // 2, groups=g), SamePad(k), TransposeLast(), LayerNorm(e, elementwise_affine=False), TransposeLast(), nn.GELU()) for _ in range(l)])
1030
+
1031
+ self.pos_conv = make_conv_block(self.embedding_dim, k, args.conv_pos_groups, num_layers)
1032
+ else: self.pos_conv = make_conv_pos(self.embedding_dim, args.conv_pos, args.conv_pos_groups)
1033
+
1034
+ self.layers = nn.ModuleList([self.build_encoder_layer(args, layer_idx=ii) for ii in range(args.encoder_layers)])
1035
+ self.layer_norm_first = args.layer_norm_first
1036
+ self.layer_norm = LayerNorm(self.embedding_dim)
1037
+ self.layerdrop = args.encoder_layerdrop
1038
+ self.apply(init_bert_params)
1039
+
1040
+ def forward(self, x, padding_mask=None, layer=None, corpus_key=None):
1041
+ x, layer_results = self.extract_features(x, padding_mask, layer, corpus_key=corpus_key)
1042
+ if self.layer_norm_first and layer is None: x = self.layer_norm(x)
1043
+ return x, layer_results
1044
+
1045
+ def extract_features(self, x, padding_mask=None, tgt_layer=None, min_layer=0, corpus_key=None):
1046
+ if padding_mask is not None: x = index_put(x, padding_mask, 0)
1047
+ x = x + self.pos_conv(x.transpose(1, 2)).transpose(1, 2)
1048
+ if not self.layer_norm_first: x = self.layer_norm(x)
1049
+ x, pad_length = pad_to_multiple(x, self.required_seq_len_multiple, dim=-2, value=0)
1050
+ if pad_length > 0 and padding_mask is None:
1051
+ padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool)
1052
+ padding_mask[:, -pad_length:] = True
1053
+ else: padding_mask, _ = pad_to_multiple(padding_mask, self.required_seq_len_multiple, dim=-1, value=True)
1054
+ x = F.dropout(x, p=self.dropout, training=self.training).transpose(0, 1)
1055
+ layer_results = []
1056
+ r = None
1057
+
1058
+ for i, layer in enumerate(self.layers):
1059
+ dropout_probability = np.random.random() if self.layerdrop > 0 else 1
1060
+ if not self.training or (dropout_probability > self.layerdrop):
1061
+ layer_check = layer
1062
+ if (corpus_key is None) or (not isinstance(layer_check, (TransformerSentenceEncoderWithAdapterLayer))): x, (z, lr) = layer(x, self_attn_padding_mask=padding_mask, need_weights=False)
1063
+ else: x, (z, lr) = layer(x, self_attn_padding_mask=padding_mask, need_weights=False, corpus_key=corpus_key)
1064
+ if i >= min_layer: layer_results.append((x, z, lr))
1065
+ if i == tgt_layer:
1066
+ r = x
1067
+ break
1068
+
1069
+ if r is not None: x = r
1070
+ x = x.transpose(0, 1)
1071
+
1072
+ if pad_length > 0:
1073
+ x = x[:, :-pad_length]
1074
+ def undo_pad(a, b, c):
1075
+ return (a[:-pad_length], b[:-pad_length] if b is not None else b, c[:-pad_length])
1076
+
1077
+ layer_results = [undo_pad(*u) for u in layer_results]
1078
+
1079
+ return x, layer_results
1080
+
1081
+ def max_positions(self):
1082
+ return self.args.max_positions
1083
+
1084
+ def upgrade_state_dict_named(self, state_dict, name):
1085
+ return state_dict
1086
+
1087
+ class Fp32GroupNorm(nn.GroupNorm):
1088
+ def __init__(self, *args, **kwargs):
1089
+ super().__init__(*args, **kwargs)
1090
+
1091
+ def forward(self, input):
1092
+ output = F.group_norm(input.float(), self.num_groups, self.weight.float() if self.weight is not None else None, self.bias.float() if self.bias is not None else None, self.eps)
1093
+ return output.type_as(input)
1094
+
1095
+ class Fp32LayerNorm(nn.LayerNorm):
1096
+ def __init__(self, *args, **kwargs):
1097
+ super().__init__(*args, **kwargs)
1098
+
1099
+ def forward(self, input):
1100
+ output = F.layer_norm(input.float(), self.normalized_shape, self.weight.float() if self.weight is not None else None, self.bias.float() if self.bias is not None else None, self.eps)
1101
+ return output.type_as(input)
1102
+
1103
+ class ConvFeatureExtractionModel(nn.Module):
1104
+ def __init__(self, conv_layers, dropout = 0.0, mode = "default", conv_bias = False):
1105
+ super().__init__()
1106
+ assert mode in {"default", "layer_norm"}
1107
+
1108
+ def block(n_in, n_out, k, stride, is_layer_norm=False, is_group_norm=False, conv_bias=False):
1109
+ def make_conv():
1110
+ conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
1111
+ nn.init.kaiming_normal_(conv.weight)
1112
+ return conv
1113
+
1114
+ assert not (is_layer_norm and is_group_norm)
1115
+
1116
+ if is_layer_norm: return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.Sequential(TransposeLast(), Fp32LayerNorm(dim, elementwise_affine=True), TransposeLast()), nn.GELU())
1117
+ elif is_group_norm: return nn.Sequential(make_conv(), nn.Dropout(p=dropout), Fp32GroupNorm(dim, dim, affine=True), nn.GELU())
1118
+ else: return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU())
1119
+
1120
+ in_d = 1
1121
+ self.conv_layers = nn.ModuleList()
1122
+ for i, cl in enumerate(conv_layers):
1123
+ assert len(cl) == 3
1124
+ (dim, k, stride) = cl
1125
+ self.conv_layers.append(block(in_d, dim, k, stride, is_layer_norm=mode == "layer_norm", is_group_norm=mode == "default" and i == 0, conv_bias=conv_bias))
1126
+ in_d = dim
1127
+
1128
+ def forward(self, x):
1129
+ x = x.unsqueeze(1)
1130
+ for conv in self.conv_layers:
1131
+ x = conv(x)
1132
+
1133
+ return x
1134
+
1135
+ class GradMultiply(torch.autograd.Function):
1136
+ @staticmethod
1137
+ def forward(ctx, x, scale):
1138
+ ctx.scale = scale
1139
+ res = x.new(x)
1140
+ return res
1141
+
1142
+ @staticmethod
1143
+ def backward(ctx, grad):
1144
+ return grad * ctx.scale, None
1145
+
1146
+ class BaseFairseqModel(nn.Module):
1147
+ def __init__(self):
1148
+ super().__init__()
1149
+ self._is_generation_fast = False
1150
+
1151
+ def get_targets(self, sample, net_output):
1152
+ return sample["target"]
1153
+
1154
+ def extract_features(self, *args, **kwargs):
1155
+ return self(*args, **kwargs)
1156
+
1157
+ def load_state_dict(self, state_dict, strict=True, model_cfg = None, args = None):
1158
+ self.upgrade_state_dict(state_dict)
1159
+ new_state_dict = prune_state_dict(state_dict, model_cfg)
1160
+ return super().load_state_dict(new_state_dict, strict)
1161
+
1162
+ def upgrade_state_dict(self, state_dict):
1163
+ self.upgrade_state_dict_named(state_dict, "")
1164
+
1165
+ def upgrade_state_dict_named(self, state_dict, name):
1166
+ assert state_dict is not None
1167
+
1168
+ def do_upgrade(m, prefix):
1169
+ if len(prefix) > 0: prefix += "."
1170
+ for n, c in m.named_children():
1171
+ name = prefix + n
1172
+ if hasattr(c, "upgrade_state_dict_named"): c.upgrade_state_dict_named(state_dict, name)
1173
+ elif hasattr(c, "upgrade_state_dict"): c.upgrade_state_dict(state_dict)
1174
+ do_upgrade(c, name)
1175
+
1176
+ do_upgrade(self, name)
1177
+
1178
+ def make_generation_fast_(self, **kwargs):
1179
+ if self._is_generation_fast: return
1180
+ self._is_generation_fast = True
1181
+
1182
+ def apply_remove_weight_norm(module):
1183
+ try:
1184
+ nn.utils.remove_weight_norm(module)
1185
+ except (AttributeError, ValueError):
1186
+ return
1187
+
1188
+ self.apply(apply_remove_weight_norm)
1189
+ def apply_make_generation_fast_(module, prefix):
1190
+ if len(prefix) > 0: prefix += "."
1191
+
1192
+ base_func = BaseFairseqModel.make_generation_fast_
1193
+ for n, m in module.named_modules():
1194
+ if (m != self and hasattr(m, "make_generation_fast_") and m.make_generation_fast_.__func__ is not base_func): m.make_generation_fast_(name=prefix + n, **kwargs)
1195
+
1196
+ apply_make_generation_fast_(self, "")
1197
+ self.eval()
1198
+
1199
+ class HubertConfig:
1200
+ def __init__(self, _name, label_rate, encoder_layers_1, logit_temp_ctr, num_negatives, cross_sample_negatives, ctr_layers, extractor_mode = "default", encoder_layers = 12, encoder_embed_dim = 768, encoder_ffn_embed_dim = 3072, encoder_attention_heads = 12, activation_fn = "gelu", layer_type = "transformer", dropout = 0.1, attention_dropout = 0.1, activation_dropout = 0.0, encoder_layerdrop = 0.0, dropout_input = 0.0, dropout_features = 0.0, final_dim = 0, untie_final_proj = False, layer_norm_first = False, conv_feature_layers = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2", conv_bias = False, logit_temp = 0.1, target_glu = False, feature_grad_mult = 1.0, mask_length = 10, mask_prob = 0.65, mask_selection = "static", mask_other = 0.0, no_mask_overlap = False, mask_min_space = 1, mask_channel_length = 10, mask_channel_prob = 0.0, mask_channel_selection = "static", mask_channel_other = 0.0, no_mask_channel_overlap = False, mask_channel_min_space = 1, conv_pos = 128, conv_pos_groups = 16, conv_pos_batch_norm = False, latent_temp = (2, 0.5, 0.999995), skip_masked = False, skip_nomask = False, checkpoint_activations = False, required_seq_len_multiple = 2, depthwise_conv_kernel_size = 31, attn_type = "", pos_enc_type = "abs", fp16 = False):
1201
+ self._name = _name
1202
+ self.label_rate = label_rate
1203
+ self.encoder_layers_1 = encoder_layers_1
1204
+ self.logit_temp_ctr = logit_temp_ctr
1205
+ self.num_negatives = num_negatives
1206
+ self.cross_sample_negatives = cross_sample_negatives
1207
+ self.ctr_layers = ctr_layers
1208
+ self.extractor_mode = extractor_mode
1209
+ self.encoder_layers = encoder_layers
1210
+ self.encoder_embed_dim = encoder_embed_dim
1211
+ self.encoder_ffn_embed_dim = encoder_ffn_embed_dim
1212
+ self.encoder_attention_heads = encoder_attention_heads
1213
+ self.activation_fn = activation_fn
1214
+ self.layer_type = layer_type
1215
+ self.dropout = dropout
1216
+ self.attention_dropout = attention_dropout
1217
+ self.activation_dropout = activation_dropout
1218
+ self.encoder_layerdrop = encoder_layerdrop
1219
+ self.dropout_input = dropout_input
1220
+ self.dropout_features = dropout_features
1221
+ self.final_dim = final_dim
1222
+ self.untie_final_proj = untie_final_proj
1223
+ self.layer_norm_first = layer_norm_first
1224
+ self.conv_feature_layers = conv_feature_layers
1225
+ self.conv_bias = conv_bias
1226
+ self.logit_temp = logit_temp
1227
+ self.target_glu = target_glu
1228
+ self.feature_grad_mult = feature_grad_mult
1229
+ self.mask_length = mask_length
1230
+ self.mask_prob = mask_prob
1231
+ self.mask_selection = mask_selection
1232
+ self.mask_other = mask_other
1233
+ self.no_mask_overlap = no_mask_overlap
1234
+ self.mask_min_space = mask_min_space
1235
+ self.mask_channel_length = mask_channel_length
1236
+ self.mask_channel_prob = mask_channel_prob
1237
+ self.mask_channel_selection = mask_channel_selection
1238
+ self.mask_channel_other = mask_channel_other
1239
+ self.no_mask_channel_overlap = no_mask_channel_overlap
1240
+ self.mask_channel_min_space = mask_channel_min_space
1241
+ self.conv_pos = conv_pos
1242
+ self.conv_pos_groups = conv_pos_groups
1243
+ self.conv_pos_batch_norm = conv_pos_batch_norm
1244
+ self.latent_temp = latent_temp
1245
+ self.skip_masked = skip_masked
1246
+ self.skip_nomask = skip_nomask
1247
+ self.checkpoint_activations = checkpoint_activations
1248
+ self.required_seq_len_multiple = required_seq_len_multiple
1249
+ self.depthwise_conv_kernel_size = depthwise_conv_kernel_size
1250
+ self.attn_type = attn_type
1251
+ self.pos_enc_type = pos_enc_type
1252
+ self.fp16 = fp16
1253
+
1254
+ class HubertModel(BaseFairseqModel):
1255
+ def __init__(self, cfg):
1256
+ super().__init__()
1257
+ feature_enc_layers = eval(cfg.conv_feature_layers)
1258
+ self.embed = feature_enc_layers[-1][0]
1259
+ self.feature_extractor = ConvFeatureExtractionModel(conv_layers=feature_enc_layers, dropout=0.0, mode=cfg.extractor_mode, conv_bias=cfg.conv_bias)
1260
+ feature_ds_rate = np.prod([s for _, _, s in feature_enc_layers])
1261
+ self.feat2tar_ratio = cfg.label_rate * feature_ds_rate / 16000
1262
+ self.post_extract_proj = (nn.Linear(self.embed, cfg.encoder_embed_dim) if self.embed != cfg.encoder_embed_dim else None)
1263
+ self.mask_prob = cfg.mask_prob
1264
+ self.mask_selection = cfg.mask_selection
1265
+ self.mask_other = cfg.mask_other
1266
+ self.mask_length = cfg.mask_length
1267
+ self.no_mask_overlap = cfg.no_mask_overlap
1268
+ self.mask_min_space = cfg.mask_min_space
1269
+ self.mask_channel_prob = cfg.mask_channel_prob
1270
+ self.mask_channel_selection = cfg.mask_channel_selection
1271
+ self.mask_channel_other = cfg.mask_channel_other
1272
+ self.mask_channel_length = cfg.mask_channel_length
1273
+ self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
1274
+ self.mask_channel_min_space = cfg.mask_channel_min_space
1275
+ self.dropout_input = nn.Dropout(cfg.dropout_input)
1276
+ self.dropout_features = nn.Dropout(cfg.dropout_features)
1277
+ self.feature_grad_mult = cfg.feature_grad_mult
1278
+ self.logit_temp = cfg.logit_temp
1279
+ self.skip_masked = cfg.skip_masked
1280
+ self.skip_nomask = cfg.skip_nomask
1281
+ final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim
1282
+ self.mask_emb = nn.Parameter(torch.FloatTensor(cfg.encoder_embed_dim).uniform_())
1283
+ self.encoder = TransformerEncoder(cfg)
1284
+ self.layer_norm = LayerNorm(self.embed)
1285
+ self.target_glu = None
1286
+ if cfg.target_glu: self.target_glu = nn.Sequential(nn.Linear(final_dim, final_dim * 2), nn.GLU())
1287
+ self.untie_final_proj = cfg.untie_final_proj
1288
+ self.final_proj = nn.Linear(cfg.encoder_embed_dim, final_dim)
1289
+ self.num_classes = [504]
1290
+ self.label_embs_concat = nn.Parameter(torch.FloatTensor(sum(self.num_classes), final_dim))
1291
+ nn.init.uniform_(self.label_embs_concat)
1292
+
1293
+ def upgrade_state_dict_named(self, state_dict, name):
1294
+ super().upgrade_state_dict_named(state_dict, name)
1295
+ return state_dict
1296
+
1297
+ def apply_mask(self, x, padding_mask, target_list):
1298
+ B, T, C = x.shape
1299
+ if self.mask_prob > 0:
1300
+ mask_indices = torch.from_numpy(compute_mask_indices((B, T), padding_mask, self.mask_prob, self.mask_length, self.mask_selection, self.mask_other, min_masks=2, no_overlap=self.no_mask_overlap, min_space=self.mask_min_space)).to(x.device)
1301
+ x[mask_indices] = self.mask_emb
1302
+ else: mask_indices = None
1303
+
1304
+ if self.mask_channel_prob > 0: x[(torch.from_numpy(compute_mask_indices((B, C), None, self.mask_channel_prob, self.mask_channel_length, self.mask_channel_selection, self.mask_channel_other, no_overlap=self.no_mask_channel_overlap, min_space=self.mask_channel_min_space)).to(x.device).unsqueeze(1).expand(-1, T, -1))] = 0
1305
+ return x, mask_indices
1306
+
1307
+ def compute_nce(self, x, pos, negs):
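+ # contrastive (NCE) logits: cosine similarity of each frame against its positive target and the negative candidates, scaled by the logit temperature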
1308
+ neg_is_pos = (pos == negs).all(-1)
1309
+ logits = torch.cosine_similarity(x.float(), torch.cat([pos.unsqueeze(0), negs], dim=0).float(), dim=-1).type_as(x)
1310
+ logits /= self.logit_temp
1311
+ if neg_is_pos.any(): logits[1:][neg_is_pos] = float("-inf")
1312
+ return logits.transpose(0, 1)
1313
+
1314
+ def forward_features(self, source):
1315
+ if self.feature_grad_mult > 0:
1316
+ features = self.feature_extractor(source)
1317
+ if self.feature_grad_mult != 1.0: features = GradMultiply.apply(features, self.feature_grad_mult)
1318
+ else:
1319
+ with torch.no_grad():
1320
+ features = self.feature_extractor(source)
1321
+ return features
1322
+
1323
+ def forward_targets(self, features, target_list):
1324
+ feat_tsz = features.size(2)
1325
+ targ_tsz = min([t.size(1) for t in target_list])
1326
+ if self.feat2tar_ratio * feat_tsz > targ_tsz:
1327
+ feat_tsz = int(targ_tsz / self.feat2tar_ratio)
1328
+ features = features[..., :feat_tsz]
1329
+
1330
+ return features, [t[:, (torch.arange(feat_tsz).float() * self.feat2tar_ratio).long()] for t in target_list]
1331
+
1332
+ def forward_padding_mask(self, features, padding_mask):
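+ # downsample the sample-level padding mask to the feature frame rate; a frame counts as padded only if every sample inside it is padded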
1333
+ extra = padding_mask.size(1) % features.size(1)
1334
+ if extra > 0: padding_mask = padding_mask[:, :-extra]
1335
+ return padding_mask.view(padding_mask.size(0), features.size(1), -1).all(-1)
1336
+
1337
+ def forward(self, source, target_list = None, padding_mask = None, mask = True, features_only = False, output_layer = None):
1338
+ features = self.forward_features(source)
1339
+ if target_list is not None: features, target_list = self.forward_targets(features, target_list)
1340
+ features_pen = features.float().pow(2).mean()
1341
+ features = self.layer_norm(features.transpose(1, 2))
1342
+ unmasked_features = features.clone()
1343
+ if padding_mask is not None: padding_mask = self.forward_padding_mask(features, padding_mask)
1344
+ if self.post_extract_proj is not None: features = self.post_extract_proj(features)
1345
+ features = self.dropout_input(features)
1346
+ unmasked_features = self.dropout_features(unmasked_features)
1347
+ if mask: x, mask_indices = self.apply_mask(features, padding_mask, target_list)
1348
+ else: x, mask_indices = features, None
1349
+ x, _ = self.encoder(x, padding_mask=padding_mask, layer=None if output_layer is None else output_layer - 1)
1350
+ if features_only: return {"x": x, "padding_mask": padding_mask, "features": features}
1351
+
1352
+ def compute_pred(proj_x, target, label_embs):
1353
+ y = torch.index_select(label_embs, 0, target.long())
1354
+ negs = label_embs.unsqueeze(1).expand(-1, proj_x.size(0), -1)
1355
+ if self.target_glu:
1356
+ y = self.target_glu(y)
1357
+ negs = self.target_glu(negs)
1358
+
1359
+ return self.compute_nce(proj_x, y, negs)
1360
+
1361
+ label_embs_list = self.label_embs_concat.split(self.num_classes, 0)
1362
+ if not self.skip_masked:
1363
+ masked_indices = torch.logical_and(~padding_mask, mask_indices)
1364
+ proj_x_m = self.final_proj(x[masked_indices])
1365
+ logit_m_list = [compute_pred(proj_x_m, t[masked_indices], label_embs_list[i]) for i, (proj_x_m, t) in enumerate(zip(proj_x_m.chunk(len(target_list), dim=-1) if self.untie_final_proj else [proj_x_m for _ in range(len(target_list))], target_list))]
1366
+ else: logit_m_list = [None for _ in target_list]
1367
+
1368
+ if not self.skip_nomask:
1369
+ nomask_indices = torch.logical_and(~padding_mask, ~mask_indices)
1370
+ proj_x_u = self.final_proj(x[nomask_indices])
1371
+ logit_u_list = [compute_pred(proj_x_u, t[nomask_indices], label_embs_list[i]) for i, (proj_x_u, t) in enumerate(zip(proj_x_u.chunk(len(target_list), dim=-1) if self.untie_final_proj else [proj_x_u for _ in range(len(target_list))], target_list))]
1372
+ else: logit_u_list = [None for _ in target_list]
1373
+
1374
+ return {"logit_m_list": logit_m_list, "logit_u_list": logit_u_list, "padding_mask": padding_mask, "features_pen": features_pen}
1375
+
1376
+ def extract_features(self, source, padding_mask = None, mask = False, ret_conv = False, output_layer = None):
1377
+ res = self.forward(source, padding_mask=padding_mask, mask=mask, features_only=True, output_layer=output_layer)
1378
+ return res["features"] if ret_conv else res["x"], res["padding_mask"]
1379
+
1380
+ def get_logits(self, net_output, is_masked=True):
1381
+ return [x.float() for x in (net_output["logit_m_list"] if is_masked else net_output["logit_u_list"]) if x is not None]
1382
+
1383
+ def get_targets(self, net_output, is_masked=True):
1384
+ return [x.new_zeros(x.size(0), dtype=torch.long) for x in self.get_logits(net_output, is_masked)]
1385
+
1386
+ def get_extra_losses(self, net_output):
1387
+ extra_losses, names = [], []
1388
+ if "features_pen" in net_output:
1389
+ extra_losses.append(net_output["features_pen"])
1390
+ names.append("features_pen")
1391
+
1392
+ return extra_losses, names
1393
+
1394
+ def remove_pretraining_modules(self):
1395
+ self.target_glu = None
1396
+ self.final_proj = None
RVC/modules/gdown.py ADDED
@@ -0,0 +1,100 @@
1
+ import os
2
+ import re
3
+ import sys
4
+ import json
5
+ import codecs
6
+ import tempfile
7
+ import requests
8
+
9
+ from urllib.parse import urlparse, parse_qs, unquote
10
+
11
+ def parse_url(url):
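+ # extract the Google Drive file id from the common share/view URL formats; also report whether the URL is already a direct /uc download link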
12
+ parsed = urlparse(url)
13
+ is_download_link = parsed.path.endswith("/uc")
14
+ if parsed.hostname not in ("drive.google.com", "docs.google.com"): return None, is_download_link
15
+ file_id = parse_qs(parsed.query).get("id", [None])[0]
16
+
17
+ if file_id is None:
18
+ for pattern in (r"^/file/d/(.*?)/(edit|view)$", r"^/file/u/[0-9]+/d/(.*?)/(edit|view)$", r"^/document/d/(.*?)/(edit|htmlview|view)$", r"^/document/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$", r"^/presentation/d/(.*?)/(edit|htmlview|view)$", r"^/presentation/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$", r"^/spreadsheets/d/(.*?)/(edit|htmlview|view)$", r"^/spreadsheets/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$"):
19
+ match = re.match(pattern, parsed.path)
20
+ if match:
21
+ file_id = match.group(1)
22
+ break
23
+ return file_id, is_download_link
24
+
25
+ def get_url_from_gdrive_confirmation(contents):
26
+ for pattern in (r'href="(\/uc\?export=download[^"]+)', r'href="/open\?id=([^"]+)"', r'"downloadUrl":"([^"]+)'):
27
+ match = re.search(pattern, contents)
28
+ if match:
29
+ url = match.group(1)
30
+ if pattern == r'href="/open\?id=([^"]+)"': url = (codecs.decode("uggcf://qevir.hfrepbagrag.tbbtyr.pbz/qbjaybnq?vq=", "rot13") + url + "&confirm=t&uuid=" + re.search(r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"', contents).group(1))
31
+ elif pattern == r'"downloadUrl":"([^"]+)': url = url.replace("\\u003d", "=").replace("\\u0026", "&")
32
+ else: url = codecs.decode("uggcf://qbpf.tbbtyr.pbz", "rot13") + url.replace("&amp;", "&")
33
+ return url
34
+
35
+ match = re.search(r'<p class="uc-error-subcaption">(.*)</p>', contents)
36
+ if match: raise Exception(match.group(1))
37
+ raise Exception
38
+
39
+ def _get_session(use_cookies, return_cookies_file=False):
40
+ sess = requests.session()
41
+ sess.headers.update({"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"})
42
+ cookies_file = os.path.join(os.path.expanduser("~"), ".cache/gdown/cookies.json")
43
+
44
+ if os.path.exists(cookies_file) and use_cookies:
45
+ with open(cookies_file) as f:
46
+ for k, v in json.load(f):
47
+ sess.cookies[k] = v
48
+ return (sess, cookies_file) if return_cookies_file else sess
49
+
50
+ def gdown_download(url=None, id=None, output=None):
51
+ if not (id is None) ^ (url is None): raise ValueError
52
+ if id is not None: url = f"{codecs.decode('uggcf://qevir.tbbtyr.pbz/hp?vq=', 'rot13')}{id}"
53
+
54
+ url_origin = url
55
+ sess, cookies_file = _get_session(use_cookies=True, return_cookies_file=True)
56
+ gdrive_file_id, is_gdrive_download_link = parse_url(url)
57
+
58
+ if gdrive_file_id:
59
+ url = f"{codecs.decode('uggcf://qevir.tbbtyr.pbz/hp?vq=', 'rot13')}{gdrive_file_id}"
60
+ url_origin = url
61
+ is_gdrive_download_link = True
62
+
63
+ while 1:
64
+ res = sess.get(url, stream=True, verify=True)
65
+ if url == url_origin and res.status_code == 500:
66
+ url = f"{codecs.decode('uggcf://qevir.tbbtyr.pbz/bcra?vq=', 'rot13')}{gdrive_file_id}"
67
+ continue
68
+
69
+ os.makedirs(os.path.dirname(cookies_file), exist_ok=True)
70
+ with open(cookies_file, "w") as f:
71
+ json.dump([(k, v) for k, v in sess.cookies.items() if not k.startswith("download_warning_")], f, indent=2)
72
+
73
+ if "Content-Disposition" in res.headers: break
74
+ if not (gdrive_file_id and is_gdrive_download_link): break
75
+
76
+ try:
77
+ url = get_url_from_gdrive_confirmation(res.text)
78
+ except Exception as e:
79
+ raise Exception(e)
80
+
81
+ if gdrive_file_id and is_gdrive_download_link:
82
+ content_disposition = unquote(res.headers["Content-Disposition"])
83
+ filename_from_url = (re.search(r"filename\*=UTF-8''(.*)", content_disposition) or re.search(r'filename=["\']?(.*?)["\']?$', content_disposition)).group(1).replace(os.path.sep, "_")
84
+ else: filename_from_url = os.path.basename(url)
85
+
86
+ output = os.path.join(output or ".", filename_from_url)
87
+ tmp_file = tempfile.mktemp(suffix=tempfile.template, prefix=os.path.basename(output), dir=os.path.dirname(output))
88
+ f = open(tmp_file, "ab")
89
+
90
+ if tmp_file is not None and f.tell() != 0: res = sess.get(url, headers={"Range": f"bytes={f.tell()}-"}, stream=True, verify=True)
91
+ print("To:", os.path.abspath(output), file=sys.stderr)
92
+
93
+ try:
94
+ for chunk in res.iter_content(chunk_size=512 * 1024):
95
+ f.write(chunk)
96
+ if tmp_file: f.close()
97
+ finally:
98
+ os.rename(tmp_file, output)
99
+ sess.close()
100
+ return output
RVC/modules/generator.py ADDED
@@ -0,0 +1,257 @@
1
+ import os
2
+ import sys
3
+ import math
4
+ import torch
5
+ import parselmouth
6
+
7
+ import numba as nb
8
+ import numpy as np
9
+
10
+ from librosa import yin, pyin
11
+ from scipy.signal import medfilt
12
+
13
+ sys.path.append(os.getcwd())
14
+
15
+ from modules.rmvpe import RMVPE
16
+ from modules.utils import Autotune
17
+ from modules.torchfcpe import FCPE
18
+ from modules.pyworld import PYWORLD
19
+ from modules.swipe import swipe, stonemask
20
+ from modules.torchcrepe import CREPE, mean, median
21
+
22
+ @nb.jit(nopython=True)
23
+ def post_process(f0, f0_up_key, f0_mel_min, f0_mel_max):
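+ # transpose f0 by f0_up_key semitones, then quantise it onto 255 mel-spaced bins (coarse f0); returns (coarse_f0, shifted_f0)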
24
+ f0 = np.multiply(f0, pow(2, f0_up_key / 12))
25
+
26
+ f0_mel = 1127 * np.log(1 + f0 / 700)
27
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
28
+ f0_mel[f0_mel <= 1] = 1
29
+ f0_mel[f0_mel > 255] = 255
30
+
31
+ return np.rint(f0_mel).astype(np.int32), f0
32
+
33
+ class Generator:
34
+ def __init__(self, sample_rate = 16000, hop_length = 160, f0_min = 50, f0_max = 1100, is_half = False, device = "cpu"):
35
+ self.sample_rate = sample_rate
36
+ self.hop_length = hop_length
37
+ self.f0_min = f0_min
38
+ self.f0_max = f0_max
39
+ self.is_half = is_half
40
+ self.device = device
41
+ self.window = 160
42
+ self.ref_freqs = [49.00, 51.91, 55.00, 58.27, 61.74, 65.41, 69.30, 73.42, 77.78, 82.41, 87.31, 92.50, 98.00, 103.83, 110.00, 116.54, 123.47, 130.81, 138.59, 146.83, 155.56, 164.81, 174.61, 185.00, 196.00, 207.65, 220.00, 233.08, 246.94, 261.63, 277.18, 293.66, 311.13, 329.63, 349.23, 369.99, 392.00, 415.30, 440.00, 466.16, 493.88, 523.25, 554.37, 587.33, 622.25, 659.25, 698.46, 739.99, 783.99, 830.61, 880.00, 932.33, 987.77, 1046.50]
43
+ self.autotune = Autotune(self.ref_freqs)
44
+ self.note_dict = self.autotune.note_dict
45
+
46
+ def calculator(self, f0_method, x, f0_up_key = 0, p_len = None, filter_radius = 3, f0_autotune = False, f0_autotune_strength = 1):
47
+ if p_len is None: p_len = x.shape[0] // self.window
48
+ f0 = self.compute_f0(f0_method, x, p_len, filter_radius if filter_radius % 2 != 0 else filter_radius + 1)
49
+
50
+ if isinstance(f0, tuple): f0 = f0[0]
51
+ if f0_autotune: f0 = Autotune.autotune_f0(self, f0, f0_autotune_strength)
52
+
53
+ return post_process(
54
+ f0,
55
+ f0_up_key,
56
+ 1127 * math.log(1 + self.f0_min / 700),
57
+ 1127 * math.log(1 + self.f0_max / 700),
58
+ )
59
+
60
+ def _resize_f0(self, x, target_len):
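+ # linearly interpolate the f0 contour to target_len frames, treating near-zero (unvoiced) values as NaN and mapping them back to 0 afterwards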
61
+ source = np.array(x)
62
+ source[source < 0.001] = np.nan
63
+
64
+ return np.nan_to_num(
65
+ np.interp(
66
+ np.arange(0, len(source) * target_len, len(source)) / target_len,
67
+ np.arange(0, len(source)),
68
+ source
69
+ )
70
+ )
71
+
72
+ def compute_f0(self, f0_method, x, p_len, filter_radius):
73
+ return {
74
+ "pm": lambda: self.get_f0_pm(x, p_len),
75
+ "dio": lambda: self.get_f0_pyworld(x, p_len, filter_radius, "dio"),
76
+ "mangio-crepe-tiny": lambda: self.get_f0_mangio_crepe(x, p_len, "tiny"),
77
+ "mangio-crepe-small": lambda: self.get_f0_mangio_crepe(x, p_len, "small"),
78
+ "mangio-crepe-medium": lambda: self.get_f0_mangio_crepe(x, p_len, "medium"),
79
+ "mangio-crepe-large": lambda: self.get_f0_mangio_crepe(x, p_len, "large"),
80
+ "mangio-crepe-full": lambda: self.get_f0_mangio_crepe(x, p_len, "full"),
81
+ "crepe-tiny": lambda: self.get_f0_crepe(x, p_len, "tiny"),
82
+ "crepe-small": lambda: self.get_f0_crepe(x, p_len, "small"),
83
+ "crepe-medium": lambda: self.get_f0_crepe(x, p_len, "medium"),
84
+ "crepe-large": lambda: self.get_f0_crepe(x, p_len, "large"),
85
+ "crepe-full": lambda: self.get_f0_crepe(x, p_len, "full"),
86
+ "fcpe": lambda: self.get_f0_fcpe(x, p_len),
87
+ "fcpe-legacy": lambda: self.get_f0_fcpe(x, p_len, legacy=True),
88
+ "rmvpe": lambda: self.get_f0_rmvpe(x, p_len),
89
+ "rmvpe-legacy": lambda: self.get_f0_rmvpe(x, p_len, legacy=True),
90
+ "harvest": lambda: self.get_f0_pyworld(x, p_len, filter_radius, "harvest"),
91
+ "yin": lambda: self.get_f0_yin(x, p_len, mode="yin"),
92
+ "pyin": lambda: self.get_f0_yin(x, p_len, mode="pyin"),
93
+ "swipe": lambda: self.get_f0_swipe(x, p_len)
94
+ }[f0_method]()
95
+
96
+ def get_f0_pm(self, x, p_len):
97
+ f0 = (
98
+ parselmouth.Sound(
99
+ x,
100
+ self.sample_rate
101
+ ).to_pitch_ac(
102
+ time_step=160 / self.sample_rate,
103
+ voicing_threshold=0.6,
104
+ pitch_floor=self.f0_min,
105
+ pitch_ceiling=self.f0_max
106
+ ).selected_array["frequency"]
107
+ )
108
+
109
+ pad_size = (p_len - len(f0) + 1) // 2
110
+
111
+ if pad_size > 0 or p_len - len(f0) - pad_size > 0: f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
112
+ return f0
113
+
114
+ def get_f0_mangio_crepe(self, x, p_len, model="full"):
115
+ if not hasattr(self, "mangio_crepe"):
116
+ self.mangio_crepe = CREPE(
117
+ os.path.join(
118
+ "models",
119
+ f"crepe_{model}.pth"
120
+ ),
121
+ model_size=model,
122
+ hop_length=self.hop_length,
123
+ batch_size=self.hop_length * 2,
124
+ f0_min=self.f0_min,
125
+ f0_max=self.f0_max,
126
+ device=self.device,
127
+ sample_rate=self.sample_rate,
128
+ return_periodicity=False
129
+ )
130
+
131
+ x = x.astype(np.float32)
132
+ x /= np.quantile(np.abs(x), 0.999)
133
+
134
+ audio = torch.unsqueeze(torch.from_numpy(x).to(self.device, copy=True), dim=0)
135
+ if audio.ndim == 2 and audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True).detach()
136
+
137
+ f0 = self.mangio_crepe.compute_f0(audio.detach(), pad=True)
138
+ return self._resize_f0(f0.squeeze(0).cpu().float().numpy(), p_len)
139
+
140
+ def get_f0_crepe(self, x, p_len, model="full"):
141
+ if not hasattr(self, "crepe"):
142
+ self.crepe = CREPE(
143
+ os.path.join(
144
+ "models",
145
+ f"crepe_{model}.pth"
146
+ ),
147
+ model_size=model,
148
+ hop_length=self.hop_length,
149
+ batch_size=512,
150
+ f0_min=self.f0_min,
151
+ f0_max=self.f0_max,
152
+ device=self.device,
153
+ sample_rate=self.sample_rate,
154
+ return_periodicity=True
155
+ )
156
+
157
+ f0, pd = self.crepe.compute_f0(torch.tensor(np.copy(x))[None].float(), pad=True)
158
+ f0, pd = mean(f0, 3), median(pd, 3)
159
+ f0[pd < 0.1] = 0
160
+
161
+ return self._resize_f0(f0[0].cpu().numpy(), p_len)
162
+
163
+ def get_f0_fcpe(self, x, p_len, legacy=False):
164
+ if not hasattr(self, "fcpe"):
165
+ self.fcpe = FCPE(
166
+ os.path.join(
167
+ "models",
168
+ ("fcpe_legacy" if legacy else "fcpe") + ".pt"
169
+ ),
170
+ hop_length=self.hop_length,
171
+ f0_min=self.f0_min,
172
+ f0_max=self.f0_max,
173
+ dtype=torch.float32,
174
+ device=self.device,
175
+ sample_rate=self.sample_rate,
176
+ threshold=0.03 if legacy else 0.006,
177
+ legacy=legacy
178
+ )
179
+
180
+ f0 = self.fcpe.compute_f0(x, p_len)
181
+ return f0
182
+
183
+ def get_f0_rmvpe(self, x, p_len, legacy=False):
184
+ if not hasattr(self, "rmvpe"):
185
+ self.rmvpe = RMVPE(
186
+ os.path.join(
187
+ "models",
188
+ "rmvpe.pt"
189
+ ),
190
+ is_half=self.is_half,
191
+ device=self.device,
192
+ )
193
+
194
+ f0 = self.rmvpe.infer_from_audio_with_pitch(x, thred=0.03, f0_min=self.f0_min, f0_max=self.f0_max) if legacy else self.rmvpe.infer_from_audio(x, thred=0.03)
195
+ return self._resize_f0(f0, p_len)
196
+
197
+ def get_f0_pyworld(self, x, p_len, filter_radius, model="harvest"):
198
+ if not hasattr(self, "pw"): self.pw = PYWORLD()
199
+
200
+ x = x.astype(np.double)
201
+ pw = self.pw.harvest if model == "harvest" else self.pw.dio
202
+
203
+ f0, t = pw(
204
+ x,
205
+ fs=self.sample_rate,
206
+ f0_ceil=self.f0_max,
207
+ f0_floor=self.f0_min,
208
+ frame_period=1000 * self.window / self.sample_rate
209
+ )
210
+
211
+ f0 = self.pw.stonemask(
212
+ x,
213
+ self.sample_rate,
214
+ t,
215
+ f0
216
+ )
217
+
218
+ if filter_radius > 2 and model == "harvest": f0 = medfilt(f0, filter_radius)
219
+ elif model == "dio":
220
+ for index, pitch in enumerate(f0):
221
+ f0[index] = round(pitch, 1)
222
+
223
+ return self._resize_f0(f0, p_len)
224
+
225
+ def get_f0_swipe(self, x, p_len):
226
+ f0, t = swipe(
227
+ x.astype(np.float32),
228
+ self.sample_rate,
229
+ f0_floor=self.f0_min,
230
+ f0_ceil=self.f0_max,
231
+ frame_period=1000 * self.window / self.sample_rate
232
+ )
233
+
234
+ return self._resize_f0(
235
+ stonemask(
236
+ x,
237
+ self.sample_rate,
238
+ t,
239
+ f0
240
+ ),
241
+ p_len
242
+ )
243
+
244
+ def get_f0_yin(self, x, p_len, mode="yin"):
245
+ self.if_yin = mode == "yin"
246
+ self.yin = yin if self.if_yin else pyin
247
+
248
+ f0 = self.yin(
249
+ x.astype(np.float32),
250
+ sr=self.sample_rate,
251
+ fmin=self.f0_min,
252
+ fmax=self.f0_max,
253
+ hop_length=self.hop_length
254
+ )
255
+
256
+ if not self.if_yin: f0 = f0[0]
257
+ return self._resize_f0(f0, p_len)
RVC/modules/hifigan.py ADDED
@@ -0,0 +1,60 @@
1
+ import os
2
+ import sys
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+ from torch.nn.utils import remove_weight_norm
7
+ from torch.nn.utils.parametrizations import weight_norm
8
+
9
+ sys.path.append(os.getcwd())
10
+
11
+ from modules.commons import init_weights
12
+ from modules.residuals import ResBlock, LRELU_SLOPE
13
+
14
+ class HiFiGANGenerator(torch.nn.Module):
15
+ def __init__(self, initial_channel, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
16
+ super(HiFiGANGenerator, self).__init__()
17
+ self.num_kernels = len(resblock_kernel_sizes)
18
+ self.num_upsamples = len(upsample_rates)
19
+ self.conv_pre = torch.nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
20
+ self.ups_and_resblocks = torch.nn.ModuleList()
21
+
22
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
23
+ self.ups_and_resblocks.append(weight_norm(torch.nn.ConvTranspose1d(upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), k, u, padding=(k - u) // 2)))
24
+ ch = upsample_initial_channel // (2 ** (i + 1))
25
+ for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
26
+ self.ups_and_resblocks.append(ResBlock(ch, k, d))
27
+
28
+ self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
29
+ self.ups_and_resblocks.apply(init_weights)
30
+ if gin_channels != 0: self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)
31
+
32
+ def forward(self, x, g = None):
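+ # Upsample stage by stage, then average the outputs of the multi-kernel residual blocks before the final projection.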
33
+ x = self.conv_pre(x)
34
+ if g is not None: x = x + self.cond(g)
35
+
36
+ resblock_idx = 0
37
+
38
+ for _ in range(self.num_upsamples):
39
+ x = self.ups_and_resblocks[resblock_idx](F.leaky_relu(x, LRELU_SLOPE))
40
+ resblock_idx += 1
41
+ xs = 0
42
+
43
+ for _ in range(self.num_kernels):
44
+ xs += self.ups_and_resblocks[resblock_idx](x)
45
+ resblock_idx += 1
46
+
47
+ x = xs / self.num_kernels
48
+
49
+ return torch.tanh(self.conv_post(F.leaky_relu(x)))
50
+
51
+ def __prepare_scriptable__(self):
52
+ for l in self.ups_and_resblocks:
53
+ for hook in l._forward_pre_hooks.values():
54
+ if (hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" and hook.__class__.__name__ == "WeightNorm"): torch.nn.utils.remove_weight_norm(l)
55
+
56
+ return self
57
+
58
+ def remove_weight_norm(self):
59
+ for l in self.ups_and_resblocks:
60
+ remove_weight_norm(l)
RVC/modules/mediafire.py ADDED
@@ -0,0 +1,30 @@
1
+ import os
2
+ import sys
3
+ import requests
4
+
5
+ from bs4 import BeautifulSoup
6
+
7
+ def Mediafire_Download(url, output=None, filename=None):
8
+ if not filename: filename = url.split('/')[-2]
9
+ if not output: output = os.path.dirname(os.path.realpath(__file__))
10
+ output_file = os.path.join(output, filename)
11
+
12
+ sess = requests.session()
13
+ sess.headers.update({"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"})
14
+
15
+ try:
16
+ with sess.get(BeautifulSoup(sess.get(url).content, "html.parser").find(id="downloadButton").get("href"), stream=True) as r:
17
+ r.raise_for_status()
18
+ with open(output_file, "wb") as f:
19
+ total_length = int(r.headers.get('content-length'))
20
+ download_progress = 0
21
+
22
+ for chunk in r.iter_content(chunk_size=1024):
23
+ download_progress += len(chunk)
24
+ f.write(chunk)
25
+ sys.stdout.write(f"\r[{filename}]: {int(100 * download_progress/total_length)}% ({round(download_progress/1024/1024, 2)}mb/{round(total_length/1024/1024, 2)}mb)")
26
+ sys.stdout.flush()
27
+ sys.stdout.write("\n")
28
+ return output_file
29
+ except Exception as e:
30
+ raise RuntimeError(e)
RVC/modules/meganz.py ADDED
@@ -0,0 +1,122 @@
1
+ import os
2
+ import re
3
+ import json
4
+ import codecs
5
+ import random
6
+ import base64
7
+ import struct
8
+ import shutil
9
+ import requests
10
+ import tempfile
11
+
12
+ from Crypto.Cipher import AES
13
+ from Crypto.Util import Counter
14
+
15
+ def makebyte(x):
16
+ return codecs.latin_1_encode(x)[0]
17
+
18
+ def a32_to_str(a):
19
+ return struct.pack('>%dI' % len(a), *a)
20
+
21
+ def get_chunks(size):
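+ # Yield (offset, size) pairs; the chunk size starts at 128 KiB and grows by 128 KiB per chunk, capped at 1 MiB.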
22
+ p, s = 0, 0x20000
23
+
24
+ while p + s < size:
25
+ yield(p, s)
26
+ p += s
27
+
28
+ if s < 0x100000: s += 0x20000
29
+
30
+ yield(p, size - p)
31
+
32
+ def aes_cbc_decrypt(data, key):
33
+ aes_cipher = AES.new(key, AES.MODE_CBC, makebyte('\0' * 16))
34
+ return aes_cipher.decrypt(data)
35
+
36
+ def decrypt_attr(attr, key):
37
+ attr = codecs.latin_1_decode(aes_cbc_decrypt(attr, a32_to_str(key)))[0].rstrip('\0')
38
+ return json.loads(attr[4:]) if attr[:6] == 'MEGA{"' else False
39
+
40
+ def _api_request(data):
41
+ sequence_num = random.randint(0, 0xFFFFFFFF)
42
+ params = {'id': sequence_num}
43
+ sequence_num += 1
44
+
45
+ if not isinstance(data, list): data = [data]
46
+ json_resp = json.loads(requests.post('{0}://g.api.{1}/cs'.format('https', 'mega.co.nz'), params=params, data=json.dumps(data), timeout=160).text)
47
+ if isinstance(json_resp, int): raise Exception(json_resp)
48
+
49
+ return json_resp[0]
50
+
51
+ def base64_url_decode(data):
52
+ data += '=='[(2 - len(data) * 3) % 4:]
53
+
54
+ for search, replace in (('-', '+'), ('_', '/'), (',', '')):
55
+ data = data.replace(search, replace)
56
+
57
+ return base64.b64decode(data)
58
+
59
+ def str_to_a32(b):
60
+ if isinstance(b, str): b = makebyte(b)
61
+ if len(b) % 4: b += b'\0' * (4 - len(b) % 4)
62
+ return struct.unpack('>%dI' % (len(b) / 4), b)
63
+
64
+ def base64_to_a32(s):
65
+ return str_to_a32(base64_url_decode(s))
66
+
67
+ def mega_download_file(file_handle, file_key, dest_path=None):
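+ # Ask the MEGA API for the download URL, decrypt the stream with AES-CTR chunk by chunk, verify the file MAC, then move the result into dest_path.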
68
+ file_key = base64_to_a32(file_key)
69
+ file_data = _api_request({'a': 'g', 'g': 1, 'p': file_handle})
70
+
71
+ k = (file_key[0] ^ file_key[4], file_key[1] ^ file_key[5], file_key[2] ^ file_key[6], file_key[3] ^ file_key[7])
72
+ iv = file_key[4:6] + (0, 0)
73
+
74
+ if 'g' not in file_data: raise Exception
75
+
76
+ file_size = file_data['s']
77
+ attribs = decrypt_attr(base64_url_decode(file_data['at']), k)
78
+ input_file = requests.get(file_data['g'], stream=True).raw
79
+
80
+ temp_output_file = tempfile.NamedTemporaryFile(mode='w+b', prefix='megapy_', delete=False)
81
+ k_str = a32_to_str(k)
82
+ aes = AES.new(k_str, AES.MODE_CTR, counter=Counter.new(128, initial_value=((iv[0] << 32) + iv[1]) << 64))
83
+
84
+ mac_str = b'\0' * 16
85
+ mac_encryptor = AES.new(k_str, AES.MODE_CBC, mac_str)
86
+ iv_str = a32_to_str([iv[0], iv[1], iv[0], iv[1]])
87
+
88
+ for _, chunk_size in get_chunks(file_size):
89
+ chunk = aes.decrypt(input_file.read(chunk_size))
90
+ temp_output_file.write(chunk)
91
+
92
+ encryptor = AES.new(k_str, AES.MODE_CBC, iv_str)
93
+
94
+ for i in range(0, len(chunk) - 16, 16):
95
+ block = chunk[i:i + 16]
96
+ encryptor.encrypt(block)
97
+
98
+ i = (i + 16) if file_size > 16 else 0
99
+ block = chunk[i:i + 16]
100
+ if len(block) % 16: block += b'\0' * (16 - (len(block) % 16))
101
+
102
+ mac_str = mac_encryptor.encrypt(encryptor.encrypt(block))
103
+
104
+ file_mac = str_to_a32(mac_str)
105
+ temp_output_file.close()
106
+
107
+ if (file_mac[0] ^ file_mac[1], file_mac[2] ^ file_mac[3]) != file_key[6:8]: raise ValueError
108
+
109
+ file_path = os.path.join(dest_path, attribs['n'])
110
+ if os.path.exists(file_path): os.remove(file_path)
111
+
112
+ shutil.move(temp_output_file.name, file_path)
113
+
114
+ def mega_download_url(url, dest_path=None):
115
+ if '/file/' in url:
116
+ url = url.replace(' ', '')
117
+ file_id = re.findall(r'\W\w\w\w\w\w\w\w\w\W', url)[0][1:-1]
118
+ path = f'{file_id}!{url[re.search(file_id, url).end() + 1:]}'.split('!')
119
+ elif '!' in url: path = re.findall(r'/#!(.*)', url)[0].split('!')
120
+ else: raise Exception
121
+
122
+ return mega_download_file(path[0], path[1], dest_path)
RVC/modules/modules.py ADDED
@@ -0,0 +1,60 @@
1
+ import os
2
+ import sys
3
+ import torch
4
+
5
+ sys.path.append(os.getcwd())
6
+
7
+ from .commons import fused_add_tanh_sigmoid_multiply
8
+
9
+ class WaveNet(torch.nn.Module):
10
+ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
11
+ super(WaveNet, self).__init__()
12
+ assert kernel_size % 2 == 1
13
+ self.hidden_channels = hidden_channels
14
+ self.kernel_size = (kernel_size,)
15
+ self.dilation_rate = dilation_rate
16
+ self.n_layers = n_layers
17
+ self.gin_channels = gin_channels
18
+ self.p_dropout = p_dropout
19
+ self.in_layers = torch.nn.ModuleList()
20
+ self.res_skip_layers = torch.nn.ModuleList()
21
+ self.drop = torch.nn.Dropout(p_dropout)
22
+ if gin_channels != 0: self.cond_layer = torch.nn.utils.parametrizations.weight_norm(torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1), name="weight")
23
+ dilations = [dilation_rate ** i for i in range(n_layers)]
24
+ paddings = [(kernel_size * d - d) // 2 for d in dilations]
25
+
26
+ for i in range(n_layers):
27
+ in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilations[i], padding=paddings[i])
28
+ in_layer = torch.nn.utils.parametrizations.weight_norm(in_layer, name="weight")
29
+ self.in_layers.append(in_layer)
30
+ res_skip_channels = (hidden_channels if i == n_layers - 1 else 2 * hidden_channels)
31
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
32
+ res_skip_layer = torch.nn.utils.parametrizations.weight_norm(res_skip_layer, name="weight")
33
+ self.res_skip_layers.append(res_skip_layer)
34
+
35
+ def forward(self, x, x_mask, g=None):
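+ # Gated tanh/sigmoid activations per dilated layer with optional global conditioning g; each layer's 1x1 conv output is split into residual and skip parts.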
36
+ output = x.clone().zero_()
37
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
38
+
39
+ if g is not None: g = self.cond_layer(g)
40
+
41
+ for i in range(self.n_layers):
42
+ x_in = self.in_layers[i](x)
43
+ g_l = (g[:, i * 2 * self.hidden_channels : (i + 1) * 2 * self.hidden_channels, :] if g is not None else 0)
44
+ res_skip_acts = self.res_skip_layers[i](self.drop(fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)))
45
+
46
+ if i < self.n_layers - 1:
47
+ x = (x + (res_skip_acts[:, : self.hidden_channels, :])) * x_mask
48
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
49
+ else: output = output + res_skip_acts
50
+
51
+ return output * x_mask
52
+
53
+ def remove_weight_norm(self):
54
+ if self.gin_channels != 0: torch.nn.utils.remove_weight_norm(self.cond_layer)
55
+
56
+ for l in self.in_layers:
57
+ torch.nn.utils.remove_weight_norm(l)
58
+
59
+ for l in self.res_skip_layers:
60
+ torch.nn.utils.remove_weight_norm(l)
RVC/modules/mrf_hifigan.py ADDED
@@ -0,0 +1,150 @@
1
+ import math
2
+ import torch
3
+
4
+ import numpy as np
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from torch.nn.utils import remove_weight_norm
9
+ from torch.utils.checkpoint import checkpoint
10
+ from torch.nn.utils.parametrizations import weight_norm
11
+
12
+ LRELU_SLOPE = 0.1
13
+
14
+ class MRFLayer(nn.Module):
15
+ def __init__(self, channels, kernel_size, dilation):
16
+ super().__init__()
17
+ self.conv1 = weight_norm(nn.Conv1d(channels, channels, kernel_size, padding=(kernel_size * dilation - dilation) // 2, dilation=dilation))
18
+ self.conv2 = weight_norm(nn.Conv1d(channels, channels, kernel_size, padding=kernel_size // 2, dilation=1))
19
+
20
+ def forward(self, x):
21
+ return x + self.conv2(F.leaky_relu(self.conv1(F.leaky_relu(x, LRELU_SLOPE)), LRELU_SLOPE))
22
+
23
+ def remove_weight_norm(self):
24
+ remove_weight_norm(self.conv1)
25
+ remove_weight_norm(self.conv2)
26
+
27
+ class MRFBlock(nn.Module):
28
+ def __init__(self, channels, kernel_size, dilations):
29
+ super().__init__()
30
+ self.layers = nn.ModuleList()
31
+
32
+ for dilation in dilations:
33
+ self.layers.append(MRFLayer(channels, kernel_size, dilation))
34
+
35
+ def forward(self, x):
36
+ for layer in self.layers:
37
+ x = layer(x)
38
+
39
+ return x
40
+
41
+ def remove_weight_norm(self):
42
+ for layer in self.layers:
43
+ layer.remove_weight_norm()
44
+
45
+ class SineGenerator(nn.Module):
46
+ def __init__(self, samp_rate, harmonic_num = 0, sine_amp = 0.1, noise_std = 0.003, voiced_threshold = 0):
47
+ super(SineGenerator, self).__init__()
48
+ self.sine_amp = sine_amp
49
+ self.noise_std = noise_std
50
+ self.harmonic_num = harmonic_num
51
+ self.dim = self.harmonic_num + 1
52
+ self.sampling_rate = samp_rate
53
+ self.voiced_threshold = voiced_threshold
54
+
55
+ def _f02uv(self, f0):
56
+ return torch.ones_like(f0) * (f0 > self.voiced_threshold)
57
+
58
+ def _f02sine(self, f0_values):
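+ # Accumulate per-frame normalized frequency into a phase track (with a random initial phase), correct for wrap-around, and take the sine.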
59
+ rad_values = (f0_values / self.sampling_rate) % 1
60
+ rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], dtype=f0_values.dtype, device=f0_values.device)
61
+ rand_ini[:, 0] = 0
62
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
63
+ tmp_over_one = torch.cumsum(rad_values, 1) % 1
64
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
65
+ cumsum_shift = torch.zeros_like(rad_values)
66
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
67
+
68
+ return torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
69
+
70
+ def forward(self, f0):
71
+ with torch.no_grad():
72
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, dtype=f0.dtype, device=f0.device)
73
+ f0_buf[:, :, 0] = f0[:, :, 0]
74
+
75
+ for idx in np.arange(self.harmonic_num):
76
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)
77
+
78
+ sine_waves = self._f02sine(f0_buf) * self.sine_amp
79
+ uv = self._f02uv(f0)
80
+ sine_waves = sine_waves * uv + ((uv * self.noise_std + (1 - uv) * self.sine_amp / 3) * torch.randn_like(sine_waves))
81
+
82
+ return sine_waves
83
+
84
+ class SourceModuleHnNSF(nn.Module):
85
+ def __init__(self, sampling_rate, harmonic_num = 0, sine_amp = 0.1, add_noise_std = 0.003, voiced_threshold = 0):
86
+ super(SourceModuleHnNSF, self).__init__()
87
+ self.sine_amp = sine_amp
88
+ self.noise_std = add_noise_std
89
+ self.l_sin_gen = SineGenerator(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold)
90
+ self.l_linear = nn.Linear(harmonic_num + 1, 1)
91
+ self.l_tanh = nn.Tanh()
92
+
93
+ def forward(self, x):
94
+ return self.l_tanh(self.l_linear(self.l_sin_gen(x).to(dtype=self.l_linear.weight.dtype)))
95
+
96
+ class HiFiGANMRFGenerator(nn.Module):
97
+ def __init__(self, in_channel, upsample_initial_channel, upsample_rates, upsample_kernel_sizes, resblock_kernel_sizes, resblock_dilations, gin_channels, sample_rate, harmonic_num, checkpointing = False):
98
+ super().__init__()
99
+ self.num_kernels = len(resblock_kernel_sizes)
100
+ self.checkpointing = checkpointing
101
+ self.f0_upsample = nn.Upsample(scale_factor=np.prod(upsample_rates))
102
+ self.m_source = SourceModuleHnNSF(sample_rate, harmonic_num)
103
+ self.conv_pre = weight_norm(nn.Conv1d(in_channel, upsample_initial_channel, kernel_size=7, stride=1, padding=3))
104
+ self.upsamples = nn.ModuleList()
105
+ self.noise_convs = nn.ModuleList()
106
+ stride_f0s = [math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1 for i in range(len(upsample_rates))]
107
+
108
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
109
+ self.upsamples.append(weight_norm(nn.ConvTranspose1d(upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), kernel_size=k, stride=u, padding=((k - u) // 2) if u % 2 == 0 else (u // 2 + u % 2), output_padding=u % 2)))
110
+ stride = stride_f0s[i]
111
+ kernel = 1 if stride == 1 else stride * 2 - stride % 2
112
+ self.noise_convs.append(nn.Conv1d(1, upsample_initial_channel // (2 ** (i + 1)), kernel_size=kernel, stride=stride, padding=0 if stride == 1 else (kernel - stride) // 2))
113
+
114
+ self.mrfs = nn.ModuleList()
115
+ for i in range(len(self.upsamples)):
116
+ channel = upsample_initial_channel // (2 ** (i + 1))
117
+ self.mrfs.append(nn.ModuleList([MRFBlock(channel, kernel_size=k, dilations=d) for k, d in zip(resblock_kernel_sizes, resblock_dilations)]))
118
+
119
+ self.conv_post = weight_norm(nn.Conv1d(channel, 1, kernel_size=7, stride=1, padding=3))
120
+ if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
121
+
122
+ def forward(self, x, f0, g = None):
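+ # A harmonic source built from f0 is added after every upsampling stage; MRF block outputs are averaged, with gradient checkpointing when enabled during training.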
123
+ har_source = self.m_source(self.f0_upsample(f0[:, None, :]).transpose(-1, -2)).transpose(-1, -2)
124
+ x = self.conv_pre(x)
125
+ if g is not None: x += self.cond(g)
126
+
127
+ for ups, mrf, noise_conv in zip(self.upsamples, self.mrfs, self.noise_convs):
128
+ x = F.leaky_relu(x, LRELU_SLOPE)
129
+
130
+ if self.training and self.checkpointing:
131
+ x = checkpoint(ups, x, use_reentrant=False) + noise_conv(har_source)
132
+ xs = sum([checkpoint(layer, x, use_reentrant=False) for layer in mrf])
133
+ else:
134
+ x = ups(x) + noise_conv(har_source)
135
+ xs = sum([layer(x) for layer in mrf])
136
+
137
+ x = xs / self.num_kernels
138
+
139
+ return torch.tanh(self.conv_post(F.leaky_relu(x)))
140
+
141
+ def remove_weight_norm(self):
142
+ remove_weight_norm(self.conv_pre)
143
+
144
+ for up in self.upsamples:
145
+ remove_weight_norm(up)
146
+
147
+ for mrf in self.mrfs:
148
+ mrf.remove_weight_norm()
149
+
150
+ remove_weight_norm(self.conv_post)
RVC/modules/noisereduce.py ADDED
@@ -0,0 +1,196 @@
1
+ import torch
2
+ import tempfile
3
+ import numpy as np
4
+
5
+ from joblib import Parallel, delayed
6
+ from torch.nn.functional import conv1d, conv2d
7
+
8
+ @torch.no_grad()
9
+ def amp_to_db(x, eps = torch.finfo(torch.float32).eps, top_db = 40):
10
+ x_db = 20 * torch.log10(x.abs() + eps)
11
+ return torch.max(x_db, (x_db.max(-1).values - top_db).unsqueeze(-1))
12
+
13
+ @torch.no_grad()
14
+ def temperature_sigmoid(x, x0, temp_coeff):
15
+ return torch.sigmoid((x - x0) / temp_coeff)
16
+
17
+ @torch.no_grad()
18
+ def linspace(start, stop, num = 50, endpoint = True, **kwargs):
19
+ return torch.linspace(start, stop, num, **kwargs) if endpoint else torch.linspace(start, stop, num + 1, **kwargs)[:-1]
20
+
21
+ def _smoothing_filter(n_grad_freq, n_grad_time):
22
+ smoothing_filter = np.outer(np.concatenate([np.linspace(0, 1, n_grad_freq + 1, endpoint=False), np.linspace(1, 0, n_grad_freq + 2)])[1:-1], np.concatenate([np.linspace(0, 1, n_grad_time + 1, endpoint=False), np.linspace(1, 0, n_grad_time + 2)])[1:-1])
23
+ return smoothing_filter / np.sum(smoothing_filter)
24
+
25
+ class SpectralGate:
26
+ def __init__(self, y, sr, prop_decrease, chunk_size, padding, n_fft, win_length, hop_length, time_constant_s, freq_mask_smooth_hz, time_mask_smooth_ms, tmp_folder, use_tqdm, n_jobs):
27
+ self.sr = sr
28
+ self.flat = False
29
+ y = np.array(y)
30
+
31
+ if len(y.shape) == 1:
32
+ self.y = np.expand_dims(y, 0)
33
+ self.flat = True
34
+ elif len(y.shape) > 2: raise ValueError
35
+ else: self.y = y
36
+
37
+ self._dtype = y.dtype
38
+ self.n_channels, self.n_frames = self.y.shape
39
+ self._chunk_size = chunk_size
40
+ self.padding = padding
41
+ self.n_jobs = n_jobs
42
+ self.use_tqdm = use_tqdm
43
+ self._tmp_folder = tmp_folder
44
+ self._n_fft = n_fft
45
+ self._win_length = self._n_fft if win_length is None else win_length
46
+ self._hop_length = (self._win_length // 4) if hop_length is None else hop_length
47
+ self._time_constant_s = time_constant_s
48
+ self._prop_decrease = prop_decrease
49
+
50
+ if (freq_mask_smooth_hz is None) and (time_mask_smooth_ms is None): self.smooth_mask = False
51
+ else: self._generate_mask_smoothing_filter(freq_mask_smooth_hz, time_mask_smooth_ms)
52
+
53
+ def _generate_mask_smoothing_filter(self, freq_mask_smooth_hz, time_mask_smooth_ms):
54
+ if freq_mask_smooth_hz is None: n_grad_freq = 1
55
+ else:
56
+ n_grad_freq = int(freq_mask_smooth_hz / (self.sr / (self._n_fft / 2)))
57
+ if n_grad_freq < 1: raise ValueError
58
+
59
+ if time_mask_smooth_ms is None: n_grad_time = 1
60
+ else:
61
+ n_grad_time = int(time_mask_smooth_ms / ((self._hop_length / self.sr) * 1000))
62
+ if n_grad_time < 1: raise ValueError
63
+
64
+ if (n_grad_time == 1) and (n_grad_freq == 1): self.smooth_mask = False
65
+ else:
66
+ self.smooth_mask = True
67
+ self._smoothing_filter = _smoothing_filter(n_grad_freq, n_grad_time)
68
+
69
+ def _read_chunk(self, i1, i2):
70
+ i1b = 0 if i1 < 0 else i1
71
+ i2b = self.n_frames if i2 > self.n_frames else i2
72
+ chunk = np.zeros((self.n_channels, i2 - i1))
73
+ chunk[:, i1b - i1: i2b - i1] = self.y[:, i1b:i2b]
74
+ return chunk
75
+
76
+ def filter_chunk(self, start_frame, end_frame):
77
+ i1 = start_frame - self.padding
78
+ return self._do_filter(self._read_chunk(i1, (end_frame + self.padding)))[:, start_frame - i1: end_frame - i1]
79
+
80
+ def _get_filtered_chunk(self, ind):
81
+ start0 = ind * self._chunk_size
82
+ end0 = (ind + 1) * self._chunk_size
83
+ return self.filter_chunk(start_frame=start0, end_frame=end0)
84
+
85
+ def _do_filter(self, chunk):
86
+ pass
87
+
88
+ def _iterate_chunk(self, filtered_chunk, pos, end0, start0, ich):
89
+ filtered_chunk[:, pos: pos + end0 - start0] = self._get_filtered_chunk(ich)[:, start0:end0]
90
+ pos += end0 - start0
91
+
92
+ def get_traces(self, start_frame=None, end_frame=None):
93
+ if start_frame is None: start_frame = 0
94
+ if end_frame is None: end_frame = self.n_frames
95
+
96
+ if self._chunk_size is not None:
97
+ if end_frame - start_frame > self._chunk_size:
98
+ ich1 = int(start_frame / self._chunk_size)
99
+ ich2 = int((end_frame - 1) / self._chunk_size)
100
+
101
+ with tempfile.NamedTemporaryFile(prefix=self._tmp_folder) as fp:
102
+ filtered_chunk = np.memmap(fp, dtype=self._dtype, shape=(self.n_channels, int(end_frame - start_frame)), mode="w+")
103
+ pos_list, start_list, end_list = [], [], []
104
+ pos = 0
105
+
106
+ for ich in range(ich1, ich2 + 1):
107
+ start0 = (start_frame - ich * self._chunk_size) if ich == ich1 else 0
108
+ end0 = end_frame - ich * self._chunk_size if ich == ich2 else self._chunk_size
109
+ pos_list.append(pos)
110
+ start_list.append(start0)
111
+ end_list.append(end0)
112
+ pos += end0 - start0
113
+
114
+ Parallel(n_jobs=self.n_jobs)(delayed(self._iterate_chunk)(filtered_chunk, pos, end0, start0, ich) for pos, start0, end0, ich in zip(pos_list, start_list, end_list, range(ich1, ich2 + 1)))
115
+ return filtered_chunk.astype(self._dtype).flatten() if self.flat else filtered_chunk.astype(self._dtype)
116
+
117
+ filtered_chunk = self.filter_chunk(start_frame=0, end_frame=end_frame)
118
+ return filtered_chunk.astype(self._dtype).flatten() if self.flat else filtered_chunk.astype(self._dtype)
119
+
120
+ class TG(torch.nn.Module):
121
+ @torch.no_grad()
122
+ def __init__(self, sr, nonstationary = False, n_std_thresh_stationary = 1.5, n_thresh_nonstationary = 1.3, temp_coeff_nonstationary = 0.1, n_movemean_nonstationary = 20, prop_decrease = 1.0, n_fft = 1024, win_length = None, hop_length = None, freq_mask_smooth_hz = 500, time_mask_smooth_ms = 50):
123
+ super().__init__()
124
+ self.sr = sr
125
+ self.nonstationary = nonstationary
126
+ assert 0.0 <= prop_decrease <= 1.0
127
+ self.prop_decrease = prop_decrease
128
+ self.n_fft = n_fft
129
+ self.win_length = self.n_fft if win_length is None else win_length
130
+ self.hop_length = self.win_length // 4 if hop_length is None else hop_length
131
+ self.n_std_thresh_stationary = n_std_thresh_stationary
132
+ self.temp_coeff_nonstationary = temp_coeff_nonstationary
133
+ self.n_movemean_nonstationary = n_movemean_nonstationary
134
+ self.n_thresh_nonstationary = n_thresh_nonstationary
135
+ self.freq_mask_smooth_hz = freq_mask_smooth_hz
136
+ self.time_mask_smooth_ms = time_mask_smooth_ms
137
+ self.register_buffer("smoothing_filter", self._generate_mask_smoothing_filter())
138
+
139
+ @torch.no_grad()
140
+ def _generate_mask_smoothing_filter(self):
141
+ if self.freq_mask_smooth_hz is None and self.time_mask_smooth_ms is None: return None
142
+ n_grad_freq = (1 if self.freq_mask_smooth_hz is None else int(self.freq_mask_smooth_hz / (self.sr / (self.n_fft / 2))))
143
+ if n_grad_freq < 1: raise ValueError
144
+
145
+ n_grad_time = (1 if self.time_mask_smooth_ms is None else int(self.time_mask_smooth_ms / ((self.hop_length / self.sr) * 1000)))
146
+ if n_grad_time < 1: raise ValueError
147
+ if n_grad_time == 1 and n_grad_freq == 1: return None
148
+
149
+ smoothing_filter = torch.outer(torch.cat([linspace(0, 1, n_grad_freq + 1, endpoint=False), linspace(1, 0, n_grad_freq + 2)])[1:-1], torch.cat([linspace(0, 1, n_grad_time + 1, endpoint=False), linspace(1, 0, n_grad_time + 2)])[1:-1]).unsqueeze(0).unsqueeze(0)
150
+ return smoothing_filter / smoothing_filter.sum()
151
+
152
+ @torch.no_grad()
153
+ def _stationary_mask(self, X_db, xn = None):
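+ # Mark time-frequency bins louder than the noise mean plus n_std_thresh_stationary standard deviations per frequency, using the noise clip xn if given, else the signal itself.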
154
+ XN_db = amp_to_db(torch.stft(xn, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, return_complex=True, pad_mode="constant", center=True, window=torch.hann_window(self.win_length).to(xn.device))).to(dtype=X_db.dtype) if xn is not None else X_db
155
+ std_freq_noise, mean_freq_noise = torch.std_mean(XN_db, dim=-1)
156
+ return torch.gt(X_db, (mean_freq_noise + std_freq_noise * self.n_std_thresh_stationary).unsqueeze(2))
157
+
158
+ @torch.no_grad()
159
+ def _nonstationary_mask(self, X_abs):
160
+ X_smoothed = (conv1d(X_abs.reshape(-1, 1, X_abs.shape[-1]), torch.ones(self.n_movemean_nonstationary, dtype=X_abs.dtype, device=X_abs.device).view(1, 1, -1), padding="same").view(X_abs.shape) / self.n_movemean_nonstationary)
161
+ return temperature_sigmoid(((X_abs - X_smoothed) / X_smoothed), self.n_thresh_nonstationary, self.temp_coeff_nonstationary)
162
+
163
+ def forward(self, x, xn = None):
164
+ assert x.ndim == 2
165
+ if x.shape[-1] < self.win_length * 2: raise Exception
166
+ assert xn is None or xn.ndim == 1 or xn.ndim == 2
167
+ if xn is not None and xn.shape[-1] < self.win_length * 2: raise Exception
168
+
169
+ X = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, return_complex=True, pad_mode="constant", center=True, window=torch.hann_window(self.win_length).to(x.device))
170
+ sig_mask = self._nonstationary_mask(X.abs()) if self.nonstationary else self._stationary_mask(amp_to_db(X), xn)
171
+
172
+ sig_mask = self.prop_decrease * (sig_mask * 1.0 - 1.0) + 1.0
173
+ if self.smoothing_filter is not None: sig_mask = conv2d(sig_mask.unsqueeze(1), self.smoothing_filter.to(sig_mask.dtype), padding="same")
174
+
175
+ Y = X * sig_mask.squeeze(1)
176
+ return torch.istft(Y, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, center=True, window=torch.hann_window(self.win_length).to(Y.device)).to(dtype=x.dtype)
177
+
178
+ class StreamedTorchGate(SpectralGate):
179
+ def __init__(self, y, sr, stationary=False, y_noise=None, prop_decrease=1.0, time_constant_s=2.0, freq_mask_smooth_hz=500, time_mask_smooth_ms=50, thresh_n_mult_nonstationary=2, sigmoid_slope_nonstationary=10, n_std_thresh_stationary=1.5, tmp_folder=None, chunk_size=600000, padding=30000, n_fft=1024, win_length=None, hop_length=None, clip_noise_stationary=True, use_tqdm=False, n_jobs=1, device="cpu"):
180
+ super().__init__(y=y, sr=sr, chunk_size=chunk_size, padding=padding, n_fft=n_fft, win_length=win_length, hop_length=hop_length, time_constant_s=time_constant_s, freq_mask_smooth_hz=freq_mask_smooth_hz, time_mask_smooth_ms=time_mask_smooth_ms, tmp_folder=tmp_folder, prop_decrease=prop_decrease, use_tqdm=use_tqdm, n_jobs=n_jobs)
181
+ self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
182
+
183
+ if y_noise is not None:
184
+ if y_noise.shape[-1] > y.shape[-1] and clip_noise_stationary: y_noise = y_noise[: y.shape[-1]]
185
+ y_noise = torch.from_numpy(y_noise).to(device)
186
+ if len(y_noise.shape) == 1: y_noise = y_noise.unsqueeze(0)
187
+
188
+ self.y_noise = y_noise
189
+ self.tg = TG(sr=sr, nonstationary=not stationary, n_std_thresh_stationary=n_std_thresh_stationary, n_thresh_nonstationary=thresh_n_mult_nonstationary, temp_coeff_nonstationary=1 / sigmoid_slope_nonstationary, n_movemean_nonstationary=int(time_constant_s / self._hop_length * sr), prop_decrease=prop_decrease, n_fft=self._n_fft, win_length=self._win_length, hop_length=self._hop_length, freq_mask_smooth_hz=freq_mask_smooth_hz, time_mask_smooth_ms=time_mask_smooth_ms).to(device)
190
+
191
+ def _do_filter(self, chunk):
192
+ if type(chunk) is np.ndarray: chunk = torch.from_numpy(chunk).to(self.device)
193
+ return self.tg(x=chunk, xn=self.y_noise).cpu().detach().numpy()
194
+
195
+ def reduce_noise(y, sr, stationary=False, y_noise=None, prop_decrease=1.0, time_constant_s=2.0, freq_mask_smooth_hz=500, time_mask_smooth_ms=50, thresh_n_mult_nonstationary=2, sigmoid_slope_nonstationary=10, tmp_folder=None, chunk_size=600000, padding=30000, n_fft=1024, win_length=None, hop_length=None, clip_noise_stationary=True, use_tqdm=False, device="cpu"):
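+ # Convenience wrapper: build a StreamedTorchGate with the given settings and return the denoised audio.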
196
+ return StreamedTorchGate(y=y, sr=sr, stationary=stationary, y_noise=y_noise, prop_decrease=prop_decrease, time_constant_s=time_constant_s, freq_mask_smooth_hz=freq_mask_smooth_hz, time_mask_smooth_ms=time_mask_smooth_ms, thresh_n_mult_nonstationary=thresh_n_mult_nonstationary, sigmoid_slope_nonstationary=sigmoid_slope_nonstationary, tmp_folder=tmp_folder, chunk_size=chunk_size, padding=padding, n_fft=n_fft, win_length=win_length, hop_length=hop_length, clip_noise_stationary=clip_noise_stationary, use_tqdm=use_tqdm, n_jobs=1, device=device).get_traces()
RVC/modules/normalization.py ADDED
@@ -0,0 +1,15 @@
1
+ import torch
2
+
3
+ import torch.nn.functional as F
4
+
5
+ class LayerNorm(torch.nn.Module):
6
+ def __init__(self, channels, eps=1e-5):
7
+ super().__init__()
8
+ self.channels = channels
9
+ self.eps = eps
10
+ self.gamma = torch.nn.Parameter(torch.ones(channels))
11
+ self.beta = torch.nn.Parameter(torch.zeros(channels))
12
+
13
+ def forward(self, x):
14
+ x = x.transpose(1, -1)
15
+ return F.layer_norm(x, (x.size(-1),), self.gamma, self.beta, self.eps).transpose(1, -1)
RVC/modules/nsf_hifigan.py ADDED
@@ -0,0 +1,116 @@
1
+ import os
2
+ import sys
3
+ import math
4
+ import torch
5
+ import numpy as np
6
+ import torch.nn.functional as F
7
+
8
+ from torch.nn.utils import remove_weight_norm
9
+ from torch.utils.checkpoint import checkpoint
10
+ from torch.nn.utils.parametrizations import weight_norm
11
+
12
+ sys.path.append(os.getcwd())
13
+
14
+ from modules.commons import init_weights
15
+ from modules.residuals import ResBlock, LRELU_SLOPE
16
+
17
+ class SineGen(torch.nn.Module):
18
+ def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0, flag_for_pulse=False):
19
+ super(SineGen, self).__init__()
20
+ self.sine_amp = sine_amp
21
+ self.noise_std = noise_std
22
+ self.harmonic_num = harmonic_num
23
+ self.dim = self.harmonic_num + 1
24
+ self.sampling_rate = samp_rate
25
+ self.voiced_threshold = voiced_threshold
26
+
27
+ def _f02uv(self, f0):
28
+ return torch.ones_like(f0) * (f0 > self.voiced_threshold)
29
+
30
+ def _f02sine(self, f0, upp):
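+ # Expand f0 to sample level (factor upp), accumulate phase across frames, scale by harmonic index, add random initial phases, and take the sine.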
31
+ rad = f0 / self.sampling_rate * torch.arange(1, upp + 1, dtype=f0.dtype, device=f0.device)
32
+ rad += F.pad((torch.fmod(rad[:, :-1, -1:].float() + 0.5, 1.0) - 0.5).cumsum(dim=1).fmod(1.0).to(f0), (0, 0, 1, 0), mode='constant')
33
+ rad = rad.reshape(f0.shape[0], -1, 1)
34
+ rad *= torch.arange(1, self.dim + 1, dtype=f0.dtype, device=f0.device).reshape(1, 1, -1)
35
+ rand_ini = torch.rand(1, 1, self.dim, device=f0.device)
36
+ rand_ini[..., 0] = 0
37
+ rad += rand_ini
38
+
39
+ return torch.sin(2 * np.pi * rad)
40
+
41
+ def forward(self, f0, upp):
42
+ with torch.no_grad():
43
+ f0 = f0.unsqueeze(-1)
44
+ sine_waves = self._f02sine(f0, upp) * self.sine_amp
45
+ uv = F.interpolate(self._f02uv(f0).transpose(2, 1), scale_factor=float(upp), mode="nearest").transpose(2, 1)
46
+ sine_waves = sine_waves * uv + ((uv * self.noise_std + (1 - uv) * self.sine_amp / 3) * torch.randn_like(sine_waves))
47
+
48
+ return sine_waves
49
+
50
+ class SourceModuleHnNSF(torch.nn.Module):
51
+ def __init__(self, sample_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0):
52
+ super(SourceModuleHnNSF, self).__init__()
53
+ self.sine_amp = sine_amp
54
+ self.noise_std = add_noise_std
55
+ self.l_sin_gen = SineGen(sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod)
56
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
57
+ self.l_tanh = torch.nn.Tanh()
58
+
59
+ def forward(self, x, upsample_factor = 1):
60
+ return self.l_tanh(self.l_linear(self.l_sin_gen(x, upsample_factor).to(dtype=self.l_linear.weight.dtype)))
61
+
62
+ class HiFiGANNRFGenerator(torch.nn.Module):
63
+ def __init__(self, initial_channel, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels, sr, checkpointing = False):
64
+ super(HiFiGANNRFGenerator, self).__init__()
65
+ self.num_kernels = len(resblock_kernel_sizes)
66
+ self.num_upsamples = len(upsample_rates)
67
+ self.upp = math.prod(upsample_rates)
68
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=self.upp)
69
+ self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0)
70
+
71
+ self.conv_pre = torch.nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
72
+ self.checkpointing = checkpointing
73
+
74
+ self.ups = torch.nn.ModuleList()
75
+ self.noise_convs = torch.nn.ModuleList()
76
+
77
+ channels = [upsample_initial_channel // (2 ** (i + 1)) for i in range(self.num_upsamples)]
78
+ stride_f0s = [math.prod(upsample_rates[i + 1 :]) if i + 1 < self.num_upsamples else 1 for i in range(self.num_upsamples)]
79
+
80
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
81
+ self.ups.append(weight_norm(torch.nn.ConvTranspose1d(upsample_initial_channel // (2**i), channels[i], k, u, padding=((k - u) // 2) if u % 2 == 0 else (u // 2 + u % 2), output_padding=u % 2)))
82
+ stride = stride_f0s[i]
83
+ kernel = 1 if stride == 1 else stride * 2 - stride % 2
84
+ self.noise_convs.append(torch.nn.Conv1d(1, channels[i], kernel_size=kernel, stride=stride, padding=0 if stride == 1 else (kernel - stride) // 2))
85
+
86
+ self.resblocks = torch.nn.ModuleList([ResBlock(channels[i], k, d) for i in range(len(self.ups)) for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes)])
87
+ self.conv_post = torch.nn.Conv1d(channels[-1], 1, 7, 1, padding=3, bias=False)
88
+
89
+ self.ups.apply(init_weights)
90
+ if gin_channels != 0: self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)
91
+
92
+ def forward(self, x, f0, g = None):
93
+ har_source = self.m_source(f0, self.upp).transpose(1, 2)
94
+ x = self.conv_pre(x)
95
+ if g is not None: x += self.cond(g)
96
+
97
+ for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)):
98
+ x = F.leaky_relu(x, LRELU_SLOPE)
99
+
100
+ if self.training and self.checkpointing:
101
+ x = checkpoint(ups, x, use_reentrant=False) + noise_convs(har_source)
102
+ xs = sum([checkpoint(resblock, x, use_reentrant=False) for j, resblock in enumerate(self.resblocks) if j in range(i * self.num_kernels, (i + 1) * self.num_kernels)])
103
+ else:
104
+ x = ups(x) + noise_convs(har_source)
105
+ xs = sum([resblock(x) for j, resblock in enumerate(self.resblocks) if j in range(i * self.num_kernels, (i + 1) * self.num_kernels)])
106
+
107
+ x = xs / self.num_kernels
108
+
109
+ return torch.tanh(self.conv_post(F.leaky_relu(x)))
110
+
111
+ def remove_weight_norm(self):
112
+ for l in self.ups:
113
+ remove_weight_norm(l)
114
+
115
+ for l in self.resblocks:
116
+ l.remove_weight_norm()
RVC/modules/opencl.py ADDED
@@ -0,0 +1,199 @@
1
+ import torch
2
+ import platform
3
+ import subprocess
4
+
5
+ import numpy as np
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from librosa.util import pad_center
10
+ from scipy.signal import get_window
11
+
12
+ try:
13
+ import pytorch_ocl
14
+ except Exception:
15
+ pytorch_ocl = None
16
+
17
+ torch_available = pytorch_ocl is not None
18
+
19
+ def get_amd_gpu_windows():
20
+ try:
21
+ return [gpu.strip() for gpu in subprocess.check_output("wmic path win32_VideoController get name", shell=True).decode().split('\n')[1:] if 'AMD' in gpu or 'Radeon' in gpu or 'Vega' in gpu]
22
+ except Exception:
23
+ return []
24
+
25
+ def get_amd_gpu_linux():
26
+ try:
27
+ return [gpu for gpu in subprocess.check_output("lspci | grep VGA", shell=True).decode().split('\n') if 'AMD' in gpu or 'Radeon' in gpu or 'Vega' in gpu]
28
+ except Exception:
29
+ return []
30
+
31
+ def get_gpu_list():
32
+ return (get_amd_gpu_windows() if platform.system() == "Windows" else get_amd_gpu_linux()) if torch_available else []
33
+
34
+ def device_count():
35
+ return len(get_gpu_list()) if torch_available else 0
36
+
37
+ def device_name(device_id = 0):
38
+ return (get_gpu_list()[device_id] if device_id >= 0 and device_id < device_count() else "") if torch_available else ""
39
+
40
+ def is_available():
41
+ return (device_count() > 0) if torch_available else False
42
+
43
+ class STFT(torch.nn.Module):
44
+ def __init__(self, filter_length=1024, hop_length=512, win_length=None, window="hann"):
45
+ super(STFT, self).__init__()
46
+ self.filter_length = filter_length
47
+ self.hop_length = hop_length
48
+ self.pad_amount = int(self.filter_length / 2)
49
+ self.win_length = win_length
50
+ self.hann_window = {}
51
+
52
+ fourier_basis = np.fft.fft(np.eye(self.filter_length))
53
+ cutoff = int((self.filter_length / 2 + 1))
54
+ fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])])
55
+ forward_basis = torch.FloatTensor(fourier_basis)
56
+ inverse_basis = torch.FloatTensor(np.linalg.pinv(fourier_basis))
57
+
58
+ if win_length is None or not win_length: win_length = filter_length
59
+ assert filter_length >= win_length
60
+
61
+ fft_window = torch.from_numpy(pad_center(get_window(window, win_length, fftbins=True), size=filter_length)).float()
62
+ forward_basis *= fft_window
63
+ inverse_basis = (inverse_basis.T * fft_window).T
64
+
65
+ self.register_buffer("forward_basis", forward_basis.float())
66
+ self.register_buffer("inverse_basis", inverse_basis.float())
67
+ self.register_buffer("fft_window", fft_window.float())
68
+
69
+ def transform(self, input_data, eps):
70
+ input_data = F.pad(input_data, (self.pad_amount, self.pad_amount), mode="reflect")
71
+ forward_transform = torch.matmul(self.forward_basis, input_data.unfold(1, self.filter_length, self.hop_length).permute(0, 2, 1))
72
+ cutoff = int(self.filter_length / 2 + 1)
73
+
74
+ return torch.sqrt(forward_transform[:, :cutoff, :]**2 + forward_transform[:, cutoff:, :]**2 + eps)
75
+
76
+ class GRU(nn.RNNBase):
77
+ def __init__(self, input_size, hidden_size, num_layers=1, bias=True, batch_first=True, dropout=0.0, bidirectional=False, device=None, dtype=None):
78
+ super().__init__("GRU", input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bias=bias, batch_first=batch_first, dropout=dropout, bidirectional=bidirectional, device=device, dtype=dtype)
79
+
80
+ @staticmethod
81
+ def _gru_cell(x, hx, weight_ih, bias_ih, weight_hh, bias_hh):
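+ # Single GRU step: reset, update and candidate gates computed from the input and hidden projections.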
82
+ gate_x = F.linear(x, weight_ih, bias_ih)
83
+ gate_h = F.linear(hx, weight_hh, bias_hh)
84
+
85
+ i_r, i_i, i_n = gate_x.chunk(3, 1)
86
+ h_r, h_i, h_n = gate_h.chunk(3, 1)
87
+
88
+ resetgate = torch.sigmoid(i_r + h_r)
89
+ inputgate = torch.sigmoid(i_i + h_i)
90
+ newgate = torch.tanh(i_n + resetgate * h_n)
91
+
92
+ hy = newgate + inputgate * (hx - newgate)
93
+ return hy
94
+
95
+ def _gru_layer(self, x, hx, weights):
96
+ weight_ih, weight_hh, bias_ih, bias_hh = weights
97
+ outputs = []
98
+
99
+ for x_t in x.unbind(1):
100
+ hx = self._gru_cell(x_t, hx, weight_ih, bias_ih, weight_hh, bias_hh)
101
+ outputs.append(hx)
102
+
103
+ return torch.stack(outputs, dim=1), hx
104
+
105
+ def _gru(self, x, hx):
106
+ if not self.batch_first: x = x.permute(1, 0, 2)
107
+ num_directions = 2 if self.bidirectional else 1
108
+
109
+ h_n = []
110
+ output_fwd, output_bwd = x, x
111
+
112
+ for layer in range(self.num_layers):
113
+ fwd_idx = layer * num_directions
114
+ bwd_idx = fwd_idx + 1 if self.bidirectional else None
115
+
116
+ weights_fwd = self._get_weights(fwd_idx)
117
+ h_fwd = hx[fwd_idx]
118
+
119
+ out_fwd, h_out_fwd = self._gru_layer(output_fwd, h_fwd, weights_fwd)
120
+ h_n.append(h_out_fwd)
121
+
122
+ if self.bidirectional:
123
+ weights_bwd = self._get_weights(bwd_idx)
124
+ h_bwd = hx[bwd_idx]
125
+
126
+ reversed_input = torch.flip(output_bwd, dims=[1])
127
+ out_bwd, h_out_bwd = self._gru_layer(reversed_input, h_bwd, weights_bwd)
128
+
129
+ out_bwd = torch.flip(out_bwd, dims=[1])
130
+ h_n.append(h_out_bwd)
131
+
132
+ output_fwd = torch.cat([out_fwd, out_bwd], dim=2)
133
+ output_bwd = output_fwd
134
+ else: output_fwd = out_fwd
135
+
136
+ if layer < self.num_layers - 1 and self.dropout > 0:
137
+ output_fwd = F.dropout(output_fwd, p=self.dropout, training=self.training)
138
+ if self.bidirectional: output_bwd = output_fwd
139
+
140
+ output = output_fwd
141
+ h_n = torch.stack(h_n, dim=0)
142
+
143
+ if not self.batch_first: output = output.permute(1, 0, 2)
144
+ return output, h_n
145
+
146
+ def _get_weights(self, layer_idx):
147
+ weights = self._all_weights[layer_idx]
148
+
149
+ weight_ih = getattr(self, weights[0])
150
+ weight_hh = getattr(self, weights[1])
151
+
152
+ bias_ih = getattr(self, weights[2]) if self.bias else None
153
+ bias_hh = getattr(self, weights[3]) if self.bias else None
154
+
155
+ return weight_ih, weight_hh, bias_ih, bias_hh
156
+
157
+ def forward(self, input, hx=None):
158
+ if input.dim() != 3: raise ValueError
159
+
160
+ batch_size = input.size(0) if self.batch_first else input.size(1)
161
+ num_directions = 2 if self.bidirectional else 1
162
+
163
+ if hx is None: hx = torch.zeros(self.num_layers * num_directions, batch_size, self.hidden_size, dtype=input.dtype, device=input.device)
164
+
165
+ self.check_forward_args(input, hx, batch_sizes=None)
166
+ return self._gru(input, hx)
167
+
168
+ def group_norm(x, num_groups, weight=None, bias=None, eps=1e-5):
169
+ N, C = x.shape[:2]
170
+ assert C % num_groups == 0
171
+
172
+ shape = (N, num_groups, C // num_groups) + x.shape[2:]
173
+ x_reshaped = x.view(shape)
174
+
175
+ dims = (2,) + tuple(range(3, x_reshaped.dim()))
176
+ mean = x_reshaped.mean(dim=dims, keepdim=True)
177
+ var = x_reshaped.var(dim=dims, keepdim=True, unbiased=False)
178
+
179
+ x_norm = (x_reshaped - mean) / torch.sqrt(var + eps)
180
+ x_norm = x_norm.view_as(x)
181
+
182
+ if weight is not None:
183
+ weight = weight.view(1, C, *([1] * (x.dim() - 2)))
184
+ x_norm = x_norm * weight
185
+
186
+ if bias is not None:
187
+ bias = bias.view(1, C, *([1] * (x.dim() - 2)))
188
+ x_norm = x_norm + bias
189
+
190
+ return x_norm
191
+
192
+ def script(f, *_, **__):
193
+ f.graph = pytorch_ocl.torch._C.Graph()
194
+ return f
195
+
196
+ if torch_available:
197
+ nn.GRU = GRU
198
+ F.group_norm = group_norm
199
+ torch.jit.script = script
RVC/modules/pipeline.py ADDED
@@ -0,0 +1,215 @@
1
+ import os
2
+ import sys
3
+ import torch
4
+ import faiss
5
+
6
+ import numpy as np
7
+ import torch.nn.functional as F
8
+
9
+ from scipy import signal
10
+
11
+ sys.path.append(os.getcwd())
12
+
13
+ from modules.generator import Generator
14
+ from modules.rms import RMSEnergyExtractor
15
+ from modules.utils import change_rms, clear_gpu_cache
16
+
17
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
18
+
19
+ class Pipeline:
20
+ def __init__(self, tgt_sr, config):
21
+ self.x_pad, self.x_query, self.x_center, self.x_max = config.device_config()
22
+ self.sample_rate = 16000
23
+ self.window = 160
24
+ self.t_pad = self.sample_rate * self.x_pad
25
+ self.t_pad_tgt = tgt_sr * self.x_pad
26
+ self.t_pad2 = self.t_pad * 2
27
+ self.t_query = self.sample_rate * self.x_query
28
+ self.t_center = self.sample_rate * self.x_center
29
+ self.t_max = self.sample_rate * self.x_max
30
+ self.time_step = self.window / self.sample_rate * 1000
31
+ self.f0_min = 50
32
+ self.f0_max = 1100
33
+ self.device = config.device
34
+ self.is_half = config.is_half
35
+
36
+ def voice_conversion(self, model, net_g, sid, audio0, pitch, pitchf, index, big_npy, index_rate, version, protect, energy):
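+ # Extract content features (HuBERT-style, layer 9 for v1 / 12 for v2), optionally blend in FAISS-retrieved features, then synthesize with net_g.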
37
+ feats = (torch.from_numpy(audio0).half() if self.is_half else torch.from_numpy(audio0).float())
38
+ pitch_guidance = pitch is not None and pitchf is not None
39
+ energy_use = energy is not None
40
+
41
+ if feats.dim() == 2: feats = feats.mean(-1)
42
+ assert feats.dim() == 1, feats.dim()
43
+ feats = feats.view(1, -1)
44
+
45
+ with torch.no_grad():
46
+ padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
47
+ logits = model.extract_features(**{"source": feats.to(self.device), "padding_mask": padding_mask, "output_layer": 9 if version == "v1" else 12})
48
+ feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
49
+
50
+ if protect < 0.5 and pitch_guidance: feats0 = feats.clone()
51
+
52
+ if index is not None and big_npy is not None and index_rate != 0:
53
+ npy = feats[0].cpu().numpy()
54
+ if self.is_half: npy = npy.astype(np.float32)
55
+
56
+ score, ix = index.search(npy, k=8)
57
+ weight = np.square(1 / score)
58
+
59
+ npy = np.sum(big_npy[ix] * np.expand_dims(weight / weight.sum(axis=1, keepdims=True), axis=2), axis=1)
60
+ if self.is_half: npy = npy.astype(np.float16)
61
+
62
+ feats = (torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats)
63
+
64
+ feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
65
+ if protect < 0.5 and pitch_guidance: feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
66
+ p_len = audio0.shape[0] // self.window
67
+
68
+ if feats.shape[1] < p_len:
69
+ p_len = feats.shape[1]
70
+ if pitch_guidance: pitch, pitchf = pitch[:, :p_len], pitchf[:, :p_len]
71
+ if energy_use: energy = energy[:, :p_len]
72
+
73
+ if protect < 0.5 and pitch_guidance:
74
+ pitchff = pitchf.clone()
75
+ pitchff[pitchf > 0] = 1
76
+ pitchff[pitchf < 1] = protect
77
+ pitchff = pitchff.unsqueeze(-1)
78
+
79
+ feats = (feats * pitchff + feats0 * (1 - pitchff)).to(feats0.dtype)
80
+
81
+ p_len = torch.tensor([p_len], device=self.device).long()
82
+ feats = feats.half() if self.is_half else feats.float()
83
+
84
+ if not pitch_guidance: pitch, pitchf = None, None
85
+ else: pitchf = pitchf.half() if self.is_half else pitchf.float()
86
+ if not energy_use: energy = None
87
+ else: energy = energy.half() if self.is_half else energy.float()
88
+
89
+ audio1 = (
90
+ (
91
+ net_g.infer(
92
+ feats,
93
+ p_len,
94
+ pitch,
95
+ pitchf,
96
+ sid,
97
+ energy
98
+ )[0][0, 0]
99
+ ).data.cpu().float().numpy()
100
+ )
101
+
102
+ del feats, p_len, net_g, model, padding_mask
103
+ clear_gpu_cache()
104
+ return audio1
105
+
106
+ def pipeline(
107
+ self,
108
+ model,
109
+ net_g,
110
+ sid,
111
+ audio,
112
+ f0_up_key,
113
+ f0_method,
114
+ file_index,
115
+ index_rate,
116
+ pitch_guidance,
117
+ filter_radius,
118
+ volume_envelope,
119
+ version,
120
+ protect,
121
+ hop_length,
122
+ energy_use=False,
123
+ f0_autotune=False,
124
+ f0_autotune_strength=False
125
+ ):
126
+ if file_index != "" and os.path.exists(file_index) and index_rate != 0:
127
+ try:
128
+ index = faiss.read_index(file_index)
129
+ big_npy = index.reconstruct_n(0, index.ntotal)
130
+ except Exception as e:
131
+ print(f"[ERROR] Error occurred while reading index file: {e}")
132
+ index = big_npy = None
133
+ else: index = big_npy = None
134
+
135
+ opt_ts, audio_opt = [], []
136
+ audio = signal.filtfilt(bh, ah, audio)
137
+ audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
138
+
139
+ if audio_pad.shape[0] > self.t_max:
140
+ audio_sum = np.zeros_like(audio)
141
+
142
+ for i in range(self.window):
143
+ audio_sum += audio_pad[i : i - self.window]
144
+
145
+ for t in range(self.t_center, audio.shape[0], self.t_center):
146
+ opt_ts.append(t - self.t_query + np.where(np.abs(audio_sum[t - self.t_query : t + self.t_query]) == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min())[0][0])
147
+
148
+ s = 0
149
+ t = None
150
+ audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
151
+ sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
152
+ p_len = audio_pad.shape[0] // self.window
153
+
154
+ if pitch_guidance:
155
+ if not hasattr(self, "f0_generator"): self.f0_generator = Generator(self.sample_rate, hop_length, self.f0_min, self.f0_max, self.is_half, self.device)
156
+ pitch, pitchf = self.f0_generator.calculator(f0_method, audio_pad, f0_up_key, p_len, filter_radius, f0_autotune, f0_autotune_strength)
157
+
158
+ if self.device == "mps": pitchf = pitchf.astype(np.float32)
159
+ pitch, pitchf = torch.tensor(pitch[:p_len], device=self.device).unsqueeze(0).long(), torch.tensor(pitchf[:p_len], device=self.device).unsqueeze(0).float()
160
+
161
+ if energy_use:
162
+ if not hasattr(self, "rms_extract"): self.rms_extract = RMSEnergyExtractor(frame_length=2048, hop_length=self.window, center=True, pad_mode = "reflect").to(self.device).eval()
163
+ energy = self.rms_extract(torch.from_numpy(audio_pad).to(self.device).unsqueeze(0)).cpu().numpy()
164
+
165
+ if self.device == "mps": energy = energy.astype(np.float32)
166
+ energy = torch.tensor(energy[:p_len], device=self.device).unsqueeze(0).float()
167
+
168
+ for t in opt_ts:
169
+ t = t // self.window * self.window
170
+ audio_opt.append(
171
+ self.voice_conversion(
172
+ model,
173
+ net_g,
174
+ sid,
175
+ audio_pad[s : t + self.t_pad2 + self.window],
176
+ pitch[:, s // self.window : (t + self.t_pad2) // self.window] if pitch_guidance else None,
177
+ pitchf[:, s // self.window : (t + self.t_pad2) // self.window] if pitch_guidance else None,
178
+ index,
179
+ big_npy,
180
+ index_rate,
181
+ version,
182
+ protect,
183
+ energy[:, s // self.window : (t + self.t_pad2) // self.window] if energy_use else None
184
+ )[self.t_pad_tgt : -self.t_pad_tgt]
185
+ )
186
+ s = t
187
+
188
+ audio_opt.append(
189
+ self.voice_conversion(
190
+ model,
191
+ net_g,
192
+ sid,
193
+ audio_pad[t:],
194
+ (pitch[:, t // self.window :] if t is not None else pitch) if pitch_guidance else None,
195
+ (pitchf[:, t // self.window :] if t is not None else pitchf) if pitch_guidance else None,
196
+ index,
197
+ big_npy,
198
+ index_rate,
199
+ version,
200
+ protect,
201
+ (energy[:, t // self.window :] if t is not None else energy) if energy_use else None
202
+ )[self.t_pad_tgt : -self.t_pad_tgt]
203
+ )
204
+
205
+ audio_opt = np.concatenate(audio_opt)
206
+
207
+ if volume_envelope != 1: audio_opt = change_rms(audio, self.sample_rate, audio_opt, self.sample_rate, volume_envelope)
208
+ audio_max = np.abs(audio_opt).max() / 0.99
209
+ if audio_max > 1: audio_opt /= audio_max
210
+
211
+ if pitch_guidance: del pitch, pitchf
212
+ del sid
213
+
214
+ clear_gpu_cache()
215
+ return audio_opt
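To make the `protect < 0.5` branch above easier to follow, here is a minimal, self-contained sketch of that masking step: frames with a non-zero `pitchf` (voiced) keep the index-mixed features, while unvoiced frames are pulled back toward the untouched copy. The tensor names mirror the snippet; shapes and values are illustrative only.

```python
import torch

protect = 0.33                                     # 0..0.5; lower keeps more of the original consonants
pitchf = torch.tensor([[0.0, 0.0, 220.0, 225.0, 0.0, 180.0]])  # per-frame f0 in Hz, 0 = unvoiced
feats = torch.randn(1, 6, 768)                     # features after index mixing
feats0 = torch.randn(1, 6, 768)                    # untouched copy of the features

pitchff = pitchf.clone()
pitchff[pitchf > 0] = 1                            # voiced frames keep feats
pitchff[pitchf < 1] = protect                      # unvoiced frames lean on feats0
pitchff = pitchff.unsqueeze(-1)                    # broadcast over the feature dimension

blended = feats * pitchff + feats0 * (1 - pitchff)
print(blended.shape)                               # torch.Size([1, 6, 768])
```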
RVC/modules/pixeldrain.py ADDED
@@ -0,0 +1,16 @@
1
+ import os
2
+ import requests
3
+
4
+ def pixeldrain(url, output_dir):
5
+ try:
6
+ response = requests.get(f"https://pixeldrain.com/api/file/{url.split('pixeldrain.com/u/')[1]}")
7
+
8
+ if response.status_code == 200:
9
+ file_path = os.path.join(output_dir, (response.headers.get("Content-Disposition").split("filename=")[-1].strip('";')))
10
+
11
+ with open(file_path, "wb") as newfile:
12
+ newfile.write(response.content)
13
+ return file_path
14
+ else: return None
15
+ except Exception as e:
16
+ raise RuntimeError(e)
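A hypothetical call to the helper above; the share URL is a placeholder and the output directory must already exist, since the function only writes the downloaded file.

```python
import os

os.makedirs("downloads", exist_ok=True)
# Placeholder link - substitute a real pixeldrain share URL before running.
path = pixeldrain("https://pixeldrain.com/u/xxxxxxxx", "downloads")
print(path if path is not None else "download failed")
```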
RVC/modules/pyworld.py ADDED
@@ -0,0 +1,84 @@
1
+ import os
2
+ import pickle
3
+ import ctypes
4
+ import platform
5
+
6
+ import numpy as np
7
+
8
+ class DioOption(ctypes.Structure):
9
+ _fields_ = [("F0Floor", ctypes.c_double), ("F0Ceil", ctypes.c_double), ("ChannelsInOctave", ctypes.c_double), ("FramePeriod", ctypes.c_double), ("Speed", ctypes.c_int), ("AllowedRange", ctypes.c_double)]
10
+
11
+ class HarvestOption(ctypes.Structure):
12
+ _fields_ = [("F0Floor", ctypes.c_double), ("F0Ceil", ctypes.c_double), ("FramePeriod", ctypes.c_double)]
13
+
14
+ class PYWORLD:
15
+ def __init__(self):
16
+ self.world_path = os.path.join("models", "world")
17
+ os.makedirs(self.world_path, exist_ok=True)
18
+ model_type, suffix = (("world_64" if platform.architecture()[0] == "64bit" else "world_86"), ".dll") if platform.system() == "Windows" else ("world_linux", ".so")
19
+ self.world_file_path = os.path.join(self.world_path, f"{model_type}{suffix}")
20
+
21
+ if not os.path.exists(self.world_file_path):
22
+ with open(os.path.join("models", "world.bin"), "rb") as f:
23
+ model = pickle.load(f)
24
+
25
+ with open(self.world_file_path, "wb") as w:
26
+ w.write(model[model_type])
27
+
28
+ self.world_dll = ctypes.CDLL(self.world_file_path)
29
+
30
+ def harvest(self, x, fs, f0_floor=50, f0_ceil=1100, frame_period=10):
31
+ self.world_dll.Harvest.argtypes = [ctypes.POINTER(ctypes.c_double), ctypes.c_int, ctypes.c_int, ctypes.POINTER(HarvestOption), ctypes.POINTER(ctypes.c_double), ctypes.POINTER(ctypes.c_double)]
32
+ self.world_dll.Harvest.restype = None
33
+ self.world_dll.InitializeHarvestOption.argtypes = [ctypes.POINTER(HarvestOption)]
34
+ self.world_dll.InitializeHarvestOption.restype = None
35
+ self.world_dll.GetSamplesForHarvest.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_double]
36
+ self.world_dll.GetSamplesForHarvest.restype = ctypes.c_int
37
+
38
+ option = HarvestOption()
39
+ self.world_dll.InitializeHarvestOption(ctypes.byref(option))
40
+
41
+ option.F0Floor = f0_floor
42
+ option.F0Ceil = f0_ceil
43
+ option.FramePeriod = frame_period
44
+
45
+ f0_length = self.world_dll.GetSamplesForHarvest(fs, len(x), option.FramePeriod)
46
+ f0 = (ctypes.c_double * f0_length)()
47
+ tpos = (ctypes.c_double * f0_length)()
48
+
49
+ self.world_dll.Harvest((ctypes.c_double * len(x))(*x), len(x), fs, ctypes.byref(option), tpos, f0)
50
+ return np.array(f0, dtype=np.float32), np.array(tpos, dtype=np.float32)
51
+
52
+ def dio(self, x, fs, f0_floor=50, f0_ceil=1100, channels_in_octave=2, frame_period=10, speed=1, allowed_range=0.1):
53
+ self.world_dll.Dio.argtypes = [ctypes.POINTER(ctypes.c_double), ctypes.c_int, ctypes.c_int, ctypes.POINTER(DioOption), ctypes.POINTER(ctypes.c_double), ctypes.POINTER(ctypes.c_double)]
54
+ self.world_dll.Dio.restype = None
55
+ self.world_dll.InitializeDioOption.argtypes = [ctypes.POINTER(DioOption)]
56
+ self.world_dll.InitializeDioOption.restype = None
57
+ self.world_dll.GetSamplesForDIO.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_double]
58
+ self.world_dll.GetSamplesForDIO.restype = ctypes.c_int
59
+
60
+ option = DioOption()
61
+ self.world_dll.InitializeDioOption(ctypes.byref(option))
62
+
63
+ option.F0Floor = f0_floor
64
+ option.F0Ceil = f0_ceil
65
+ option.ChannelsInOctave = channels_in_octave
66
+ option.FramePeriod = frame_period
67
+ option.Speed = speed
68
+ option.AllowedRange = allowed_range
69
+
70
+ f0_length = self.world_dll.GetSamplesForDIO(fs, len(x), option.FramePeriod)
71
+ f0 = (ctypes.c_double * f0_length)()
72
+ tpos = (ctypes.c_double * f0_length)()
73
+
74
+ self.world_dll.Dio((ctypes.c_double * len(x))(*x), len(x), fs, ctypes.byref(option), tpos, f0)
75
+ return np.array(f0, dtype=np.float32), np.array(tpos, dtype=np.float32)
76
+
77
+ def stonemask(self, x, fs, tpos, f0):
78
+ self.world_dll.StoneMask.argtypes = [ctypes.POINTER(ctypes.c_double), ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_double), ctypes.POINTER(ctypes.c_double), ctypes.c_int, ctypes.POINTER(ctypes.c_double)]
79
+ self.world_dll.StoneMask.restype = None
80
+
81
+ out_f0 = (ctypes.c_double * len(f0))()
82
+ self.world_dll.StoneMask((ctypes.c_double * len(x))(*x), len(x), fs, (ctypes.c_double * len(tpos))(*tpos), (ctypes.c_double * len(f0))(*f0), len(f0), out_f0)
83
+
84
+ return np.array(out_f0, dtype=np.float32)
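The wrapper follows the usual WORLD flow: Harvest (or Dio) yields a coarse f0 track plus frame times, and StoneMask refines it. The sketch below assumes the bundled `models/world.bin` payload is present so the shared library can be unpacked; the 220 Hz tone is purely illustrative.

```python
import numpy as np

sr = 16000
t = np.arange(sr) / sr
x = 0.5 * np.sin(2 * np.pi * 220.0 * t)        # one second of a 220 Hz sine

world = PYWORLD()
f0, tpos = world.harvest(x, sr, f0_floor=50, f0_ceil=1100, frame_period=10)
f0 = world.stonemask(x, sr, tpos, f0)          # per-frame refinement
print(f0.shape, float(np.median(f0[f0 > 0])))  # the median should sit near 220 Hz
```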
RVC/modules/refinegan.py ADDED
@@ -0,0 +1,170 @@
1
+ import os
2
+ import sys
3
+ import math
4
+ import torch
5
+
6
+ import numpy as np
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from torch.utils.checkpoint import checkpoint
11
+ from torch.nn.utils import remove_weight_norm
12
+ from torch.nn.utils.parametrizations import weight_norm
13
+
14
+ sys.path.append(os.getcwd())
15
+
16
+ from modules.commons import init_weights, get_padding
17
+
18
+
19
+ class ResBlock(nn.Module):
20
+ def __init__(self, channels, kernel_size = 7, dilation = (1, 3, 5), leaky_relu_slope = 0.2):
21
+ super().__init__()
22
+ self.leaky_relu_slope = leaky_relu_slope
23
+ self.convs1 = nn.ModuleList([weight_norm(nn.Conv1d(channels, channels, kernel_size, stride=1, dilation=d, padding=get_padding(kernel_size, d))) for d in dilation])
24
+ self.convs1.apply(init_weights)
25
+ self.convs2 = nn.ModuleList([weight_norm(nn.Conv1d(channels, channels, kernel_size, stride=1, dilation=1, padding=get_padding(kernel_size, 1))) for _ in dilation])
26
+ self.convs2.apply(init_weights)
27
+
28
+ def forward(self, x):
29
+ for c1, c2 in zip(self.convs1, self.convs2):
30
+ x = c2(F.leaky_relu(c1(F.leaky_relu(x, self.leaky_relu_slope)), self.leaky_relu_slope)) + x
31
+
32
+ return x
33
+
34
+ def remove_weight_norm(self):
35
+ for c1, c2 in zip(self.convs1, self.convs2):
36
+ remove_weight_norm(c1)
37
+ remove_weight_norm(c2)
38
+
39
+ class AdaIN(nn.Module):
40
+ def __init__(self, *, channels, leaky_relu_slope = 0.2):
41
+ super().__init__()
42
+ self.weight = nn.Parameter(torch.ones(channels))
43
+ self.activation = nn.LeakyReLU(leaky_relu_slope)
44
+
45
+ def forward(self, x):
46
+ return self.activation(x + (torch.randn_like(x) * self.weight[None, :, None]))
47
+
48
+ class ParallelResBlock(nn.Module):
49
+ def __init__(self, *, in_channels, out_channels, kernel_sizes = (3, 7, 11), dilation = (1, 3, 5), leaky_relu_slope = 0.2):
50
+ super().__init__()
51
+ self.in_channels = in_channels
52
+ self.out_channels = out_channels
53
+ self.input_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=7, stride=1, padding=3)
54
+ self.input_conv.apply(init_weights)
55
+ self.blocks = nn.ModuleList([nn.Sequential(AdaIN(channels=out_channels), ResBlock(out_channels, kernel_size=kernel_size, dilation=dilation, leaky_relu_slope=leaky_relu_slope), AdaIN(channels=out_channels)) for kernel_size in kernel_sizes])
56
+
57
+ def forward(self, x):
58
+ x = self.input_conv(x)
59
+ return torch.stack([block(x) for block in self.blocks], dim=0).mean(dim=0)
60
+
61
+ def remove_weight_norm(self):
62
+ remove_weight_norm(self.input_conv)
63
+ for block in self.blocks:
64
+ block[1].remove_weight_norm()
65
+
66
+ class SineGenerator(nn.Module):
67
+ def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0):
68
+ super(SineGenerator, self).__init__()
69
+ self.sine_amp = sine_amp
70
+ self.noise_std = noise_std
71
+ self.harmonic_num = harmonic_num
72
+ self.dim = self.harmonic_num + 1
73
+ self.sampling_rate = samp_rate
74
+ self.voiced_threshold = voiced_threshold
75
+ self.merge = nn.Sequential(nn.Linear(self.dim, 1, bias=False), nn.Tanh())
76
+
77
+ def _f02uv(self, f0):
78
+ return torch.ones_like(f0) * (f0 > self.voiced_threshold)
79
+
80
+ def _f02sine(self, f0_values):
81
+ rad_values = (f0_values / self.sampling_rate) % 1
82
+ rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], dtype=f0_values.dtype, device=f0_values.device)
83
+
84
+ rand_ini[:, 0] = 0
85
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
86
+
87
+ tmp_over_one = torch.cumsum(rad_values, 1) % 1
88
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
89
+
90
+ cumsum_shift = torch.zeros_like(rad_values)
91
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
92
+
93
+ return torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
94
+
95
+ def forward(self, f0):
96
+ with torch.no_grad():
97
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, dtype=f0.dtype, device=f0.device)
98
+ f0_buf[:, :, 0] = f0[:, :, 0]
99
+
100
+ for idx in np.arange(self.harmonic_num):
101
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)
102
+
103
+ sine_waves = self._f02sine(f0_buf) * self.sine_amp
104
+ uv = self._f02uv(f0)
105
+ sine_waves = sine_waves * uv + ((uv * self.noise_std + (1 - uv) * self.sine_amp / 3) * torch.randn_like(sine_waves))
106
+
107
+ return self.merge(sine_waves)
108
+
109
+ class RefineGANGenerator(nn.Module):
110
+ def __init__(self, *, sample_rate = 44100, upsample_rates = (8, 8, 2, 2), leaky_relu_slope = 0.2, num_mels = 128, gin_channels = 256, checkpointing = False, upsample_initial_channel = 512):
111
+ super().__init__()
112
+ self.upsample_rates = upsample_rates
113
+ self.checkpointing = checkpointing
114
+ self.leaky_relu_slope = leaky_relu_slope
115
+ self.upp = np.prod(upsample_rates)
116
+ self.m_source = SineGenerator(sample_rate)
117
+ self.pre_conv = weight_norm(nn.Conv1d(1, upsample_initial_channel // 2, 7, 1, padding=3))
118
+ stride_f0s = [math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1 for i in range(len(upsample_rates))]
119
+
120
+ channels = upsample_initial_channel
121
+ self.downsample_blocks = nn.ModuleList([])
122
+
123
+ for i, _ in enumerate(upsample_rates):
124
+ stride = stride_f0s[i]
125
+ kernel = 1 if stride == 1 else stride * 2 - stride % 2
126
+
127
+ self.downsample_blocks.append(weight_norm(nn.Conv1d(1, channels // 2 ** (i + 2), kernel, stride, padding=0 if stride == 1 else (kernel - stride) // 2)))
128
+
129
+ self.mel_conv = weight_norm(nn.Conv1d(num_mels, channels // 2, 7, 1, padding=3))
130
+ self.mel_conv.apply(init_weights)
131
+
132
+ if gin_channels != 0: self.cond = nn.Conv1d(256, channels // 2, 1)
133
+
134
+ self.upsample_blocks = nn.ModuleList([])
135
+ self.upsample_conv_blocks = nn.ModuleList([])
136
+
137
+ for rate in upsample_rates:
138
+ new_channels = channels // 2
139
+ self.upsample_blocks.append(nn.Upsample(scale_factor=rate, mode="linear"))
140
+ self.upsample_conv_blocks.append(ParallelResBlock(in_channels=channels + channels // 4, out_channels=new_channels, kernel_sizes=(3, 7, 11), dilation=(1, 3, 5), leaky_relu_slope=leaky_relu_slope))
141
+ channels = new_channels
142
+
143
+ self.conv_post = weight_norm(nn.Conv1d(channels, 1, 7, 1, padding=3, bias=False))
144
+ self.conv_post.apply(init_weights)
145
+
146
+ def forward(self, mel, f0, g = None):
147
+ har_source = self.m_source(F.interpolate(f0.unsqueeze(1), size=mel.shape[-1] * self.upp, mode="linear").transpose(1, 2)).transpose(1, 2)
148
+ x = F.interpolate(self.pre_conv(har_source), size=mel.shape[-1], mode="linear")
149
+
150
+ mel = self.mel_conv(mel)
151
+ if g is not None: mel += self.cond(g)
152
+
153
+ x = torch.cat([mel, x], dim=1)
154
+
155
+ for ups, res, down in zip(self.upsample_blocks, self.upsample_conv_blocks, self.downsample_blocks):
156
+ x = F.leaky_relu(x, self.leaky_relu_slope)
157
+ x = checkpoint(res, torch.cat([checkpoint(ups, x, use_reentrant=False), down(har_source)], dim=1), use_reentrant=False) if self.training and self.checkpointing else res(torch.cat([ups(x), down(har_source)], dim=1))
158
+
159
+ return torch.tanh(self.conv_post(F.leaky_relu(x, self.leaky_relu_slope)))
160
+
161
+ def remove_weight_norm(self):
162
+ remove_weight_norm(self.pre_conv)
163
+ remove_weight_norm(self.mel_conv)
164
+ remove_weight_norm(self.conv_post)
165
+
166
+ for block in self.downsample_blocks:
167
+ block.remove_weight_norm()
168
+
169
+ for block in self.upsample_conv_blocks:
170
+ block.remove_weight_norm()
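A quick shape check for the generator with random inputs. With the default `upsample_rates=(8, 8, 2, 2)` the hop is 256 samples, so 50 mel frames should give 12,800 output samples; `gin_channels=0` skips the speaker-conditioning branch. This is a smoke test on untrained weights, not a usage guarantee.

```python
import torch

gen = RefineGANGenerator(sample_rate=44100, upsample_rates=(8, 8, 2, 2), num_mels=128, gin_channels=0)
mel = torch.randn(1, 128, 50)                  # (batch, n_mels, frames)
f0 = torch.rand(1, 50) * 200 + 100             # per-frame f0 in Hz

with torch.no_grad():
    wav = gen(mel, f0)
print(wav.shape)                               # torch.Size([1, 1, 12800])
```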
RVC/modules/residuals.py ADDED
@@ -0,0 +1,140 @@
1
+ import os
2
+ import sys
3
+ import torch
4
+
5
+ from torch.nn.utils import remove_weight_norm
6
+ from torch.nn.utils.parametrizations import weight_norm
7
+
8
+ sys.path.append(os.getcwd())
9
+
10
+ from .modules import WaveNet
11
+ from .commons import get_padding, init_weights
12
+
13
+
14
+ LRELU_SLOPE = 0.1
15
+
16
+ def create_conv1d_layer(channels, kernel_size, dilation):
17
+ return weight_norm(torch.nn.Conv1d(channels, channels, kernel_size, 1, dilation=dilation, padding=get_padding(kernel_size, dilation)))
18
+
19
+ def apply_mask(tensor, mask):
20
+ return tensor * mask if mask is not None else tensor
21
+
22
+ class ResBlockBase(torch.nn.Module):
23
+ def __init__(self, channels, kernel_size, dilations):
24
+ super(ResBlockBase, self).__init__()
25
+
26
+ self.convs1 = torch.nn.ModuleList([create_conv1d_layer(channels, kernel_size, d) for d in dilations])
27
+ self.convs1.apply(init_weights)
28
+
29
+ self.convs2 = torch.nn.ModuleList([create_conv1d_layer(channels, kernel_size, 1) for _ in dilations])
30
+ self.convs2.apply(init_weights)
31
+
32
+ def forward(self, x, x_mask=None):
33
+ for c1, c2 in zip(self.convs1, self.convs2):
34
+ x = c2(apply_mask(torch.nn.functional.leaky_relu(c1(apply_mask(torch.nn.functional.leaky_relu(x, LRELU_SLOPE), x_mask)), LRELU_SLOPE), x_mask)) + x
35
+
36
+ return apply_mask(x, x_mask)
37
+
38
+ def remove_weight_norm(self):
39
+ for conv in self.convs1 + self.convs2:
40
+ remove_weight_norm(conv)
41
+
42
+ class ResBlock(ResBlockBase):
43
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
44
+ super(ResBlock, self).__init__(channels, kernel_size, dilation)
45
+
46
+ class Log(torch.nn.Module):
47
+ def forward(self, x, x_mask, reverse=False, **kwargs):
48
+ if not reverse:
49
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
50
+ return y, torch.sum(-y, [1, 2])
51
+ else: return torch.exp(x) * x_mask
52
+
53
+ class Flip(torch.nn.Module):
54
+ def forward(self, x, *args, reverse=False, **kwargs):
55
+ x = torch.flip(x, [1])
56
+
57
+ if not reverse: return x, torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
58
+ else: return x
59
+
60
+ class ElementwiseAffine(torch.nn.Module):
61
+ def __init__(self, channels):
62
+ super().__init__()
63
+ self.channels = channels
64
+ self.m = torch.nn.Parameter(torch.zeros(channels, 1))
65
+ self.logs = torch.nn.Parameter(torch.zeros(channels, 1))
66
+
67
+ def forward(self, x, x_mask, reverse=False, **kwargs):
68
+ if not reverse: return ((self.m + torch.exp(self.logs) * x) * x_mask), torch.sum(self.logs * x_mask, [1, 2])
69
+ else: return (x - self.m) * torch.exp(-self.logs) * x_mask
70
+
71
+ class ResidualCouplingBlock(torch.nn.Module):
72
+ def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0):
73
+ super(ResidualCouplingBlock, self).__init__()
74
+ self.channels = channels
75
+ self.hidden_channels = hidden_channels
76
+ self.kernel_size = kernel_size
77
+ self.dilation_rate = dilation_rate
78
+ self.n_layers = n_layers
79
+ self.n_flows = n_flows
80
+ self.gin_channels = gin_channels
81
+ self.flows = torch.nn.ModuleList()
82
+
83
+ for _ in range(n_flows):
84
+ self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
85
+ self.flows.append(Flip())
86
+
87
+ def forward(self, x, x_mask, g = None, reverse = False):
88
+ if not reverse:
89
+ for flow in self.flows:
90
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
91
+ else:
92
+ for flow in reversed(self.flows):
93
+ x = flow.forward(x, x_mask, g=g, reverse=reverse)
94
+
95
+ return x
96
+
97
+ def remove_weight_norm(self):
98
+ for i in range(self.n_flows):
99
+ self.flows[i * 2].remove_weight_norm()
100
+
101
+ def __prepare_scriptable__(self):
102
+ for i in range(self.n_flows):
103
+ for hook in self.flows[i * 2]._forward_pre_hooks.values():
104
+ if (hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" and hook.__class__.__name__ == "WeightNorm"): torch.nn.utils.remove_weight_norm(self.flows[i * 2])
105
+
106
+ return self
107
+
108
+ class ResidualCouplingLayer(torch.nn.Module):
109
+ def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False):
110
+ assert channels % 2 == 0, "Channels/2"
111
+ super().__init__()
112
+ self.channels = channels
113
+ self.hidden_channels = hidden_channels
114
+ self.kernel_size = kernel_size
115
+ self.dilation_rate = dilation_rate
116
+ self.n_layers = n_layers
117
+ self.half_channels = channels // 2
118
+ self.mean_only = mean_only
119
+
120
+ self.pre = torch.nn.Conv1d(self.half_channels, hidden_channels, 1)
121
+ self.enc = WaveNet(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
122
+ self.post = torch.nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
123
+
124
+ self.post.weight.data.zero_()
125
+ self.post.bias.data.zero_()
126
+
127
+ def forward(self, x, x_mask, g=None, reverse=False):
128
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
129
+ stats = self.post(self.enc((self.pre(x0) * x_mask), x_mask, g=g)) * x_mask
130
+
131
+ if not self.mean_only: m, logs = torch.split(stats, [self.half_channels] * 2, 1)
132
+ else:
133
+ m = stats
134
+ logs = torch.zeros_like(m)
135
+
136
+ if not reverse: return torch.cat([x0, (m + x1 * torch.exp(logs) * x_mask)], 1), torch.sum(logs, [1, 2])
137
+ else: return torch.cat([x0, ((x1 - m) * torch.exp(-logs) * x_mask)], 1)
138
+
139
+ def remove_weight_norm(self):
140
+ self.enc.remove_weight_norm()
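Because each coupling layer only shifts and scales half of the channels, the whole block is invertible: running it forward and then with `reverse=True` should reproduce the input. A minimal numerical check with arbitrary sizes and an all-ones mask:

```python
import torch

flow = ResidualCouplingBlock(channels=192, hidden_channels=192, kernel_size=5, dilation_rate=1, n_layers=3)
x = torch.randn(1, 192, 40)
x_mask = torch.ones(1, 1, 40)

with torch.no_grad():
    z = flow(x, x_mask)                        # forward direction through all flows
    x_rec = flow(z, x_mask, reverse=True)      # run the flows backwards
print(torch.allclose(x, x_rec, atol=1e-4))     # expected: True
```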
RVC/modules/rms.py ADDED
@@ -0,0 +1,30 @@
1
+ import torch
2
+ import librosa
3
+
4
+ import torch.nn as nn
5
+
6
+ class RMSEnergyExtractor(nn.Module):
7
+ def __init__(self, frame_length=2048, hop_length=512, center=True, pad_mode = "reflect"):
8
+ super().__init__()
9
+ self.frame_length = frame_length
10
+ self.hop_length = hop_length
11
+ self.center = center
12
+ self.pad_mode = pad_mode
13
+
14
+ def forward(self, x):
15
+ assert x.ndim == 2
16
+ assert x.shape[0] == 1
17
+
18
+ if str(x.device).startswith("ocl"): x = x.contiguous()
19
+
20
+ rms = torch.from_numpy(
21
+ librosa.feature.rms(
22
+ y=x.squeeze(0).cpu().numpy(),
23
+ frame_length=self.frame_length,
24
+ hop_length=self.hop_length,
25
+ center=self.center,
26
+ pad_mode=self.pad_mode
27
+ )
28
+ )
29
+
30
+ return rms.squeeze(-2).to(x.device) if not str(x.device).startswith("ocl") else rms.contiguous().squeeze(-2).to(x.device)
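Minimal usage sketch for the extractor: a `(1, samples)` mono waveform in, a 1-D per-hop RMS envelope out. The 160-sample hop matches the window size the conversion pipeline passes in, but any hop works.

```python
import torch

extractor = RMSEnergyExtractor(frame_length=2048, hop_length=160, center=True, pad_mode="reflect")
wav = torch.randn(1, 16000)                    # one second of noise at 16 kHz
energy = extractor(wav)                        # 1-D tensor, one RMS value per hop (~101 frames)
print(energy.shape)
```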
RVC/modules/rmvpe.py ADDED
@@ -0,0 +1,260 @@
1
+ import os
2
+ import sys
3
+ import torch
4
+
5
+ import numpy as np
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from librosa.filters import mel
10
+
11
+ sys.path.append(os.getcwd())
12
+
13
+ from modules import opencl
14
+
15
+ N_MELS, N_CLASS = 128, 360
16
+
17
+ class ConvBlockRes(nn.Module):
18
+ def __init__(self, in_channels, out_channels, momentum=0.01):
19
+ super(ConvBlockRes, self).__init__()
20
+ self.conv = nn.Sequential(nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False), nn.BatchNorm2d(out_channels, momentum=momentum), nn.ReLU(), nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False), nn.BatchNorm2d(out_channels, momentum=momentum), nn.ReLU())
21
+ if in_channels != out_channels:
22
+ self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
23
+ self.is_shortcut = True
24
+ else: self.is_shortcut = False
25
+
26
+ def forward(self, x):
27
+ return (self.conv(x) + self.shortcut(x)) if self.is_shortcut else (self.conv(x) + x)
28
+
29
+ class ResEncoderBlock(nn.Module):
30
+ def __init__(self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01):
31
+ super(ResEncoderBlock, self).__init__()
32
+ self.n_blocks = n_blocks
33
+ self.conv = nn.ModuleList()
34
+ self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
35
+
36
+ for _ in range(n_blocks - 1):
37
+ self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
38
+
39
+ self.kernel_size = kernel_size
40
+ if self.kernel_size is not None: self.pool = nn.AvgPool2d(kernel_size=kernel_size)
41
+
42
+ def forward(self, x):
43
+ for i in range(self.n_blocks):
44
+ x = self.conv[i](x)
45
+
46
+ if self.kernel_size is not None: return x, self.pool(x)
47
+ else: return x
48
+
49
+ class Encoder(nn.Module):
50
+ def __init__(self, in_channels, in_size, n_encoders, kernel_size, n_blocks, out_channels=16, momentum=0.01):
51
+ super(Encoder, self).__init__()
52
+ self.n_encoders = n_encoders
53
+ self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
54
+ self.layers = nn.ModuleList()
55
+
56
+ for _ in range(self.n_encoders):
57
+ self.layers.append(ResEncoderBlock(in_channels, out_channels, kernel_size, n_blocks, momentum=momentum))
58
+ in_channels = out_channels
59
+ out_channels *= 2
60
+ in_size //= 2
61
+
62
+ self.out_size = in_size
63
+ self.out_channel = out_channels
64
+
65
+ def forward(self, x):
66
+ concat_tensors = []
67
+ x = self.bn(x)
68
+
69
+ for layer in self.layers:
70
+ t, x = layer(x)
71
+ concat_tensors.append(t)
72
+
73
+ return x, concat_tensors
74
+
75
+ class Intermediate(nn.Module):
76
+ def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
77
+ super(Intermediate, self).__init__()
78
+ self.layers = nn.ModuleList()
79
+ self.layers.append(ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum))
80
+
81
+ for _ in range(n_inters - 1):
82
+ self.layers.append(ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum))
83
+
84
+ def forward(self, x):
85
+ for layer in self.layers:
86
+ x = layer(x)
87
+
88
+ return x
89
+
90
+ class ResDecoderBlock(nn.Module):
91
+ def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
92
+ super(ResDecoderBlock, self).__init__()
93
+ out_padding = (0, 1) if stride == (1, 2) else (1, 1)
94
+ self.conv1 = nn.Sequential(nn.ConvTranspose2d(in_channels=in_channels, out_channels=out_channels, kernel_size=(3, 3), stride=stride, padding=(1, 1), output_padding=out_padding, bias=False), nn.BatchNorm2d(out_channels, momentum=momentum), nn.ReLU())
95
+ self.conv2 = nn.ModuleList()
96
+ self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
97
+
98
+ for _ in range(n_blocks - 1):
99
+ self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
100
+
101
+ def forward(self, x, concat_tensor):
102
+ x = torch.cat((self.conv1(x), concat_tensor), dim=1)
103
+ for conv2 in self.conv2:
104
+ x = conv2(x)
105
+
106
+ return x
107
+
108
+ class Decoder(nn.Module):
109
+ def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
110
+ super(Decoder, self).__init__()
111
+ self.layers = nn.ModuleList()
112
+
113
+ for _ in range(n_decoders):
114
+ out_channels = in_channels // 2
115
+ self.layers.append(ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum))
116
+ in_channels = out_channels
117
+
118
+ def forward(self, x, concat_tensors):
119
+ for i, layer in enumerate(self.layers):
120
+ x = layer(x, concat_tensors[-1 - i])
121
+
122
+ return x
123
+
124
+ class DeepUnet(nn.Module):
125
+ def __init__(self, kernel_size, n_blocks, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16):
126
+ super(DeepUnet, self).__init__()
127
+ self.encoder = Encoder(in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels)
128
+ self.intermediate = Intermediate(self.encoder.out_channel // 2, self.encoder.out_channel, inter_layers, n_blocks)
129
+ self.decoder = Decoder(self.encoder.out_channel, en_de_layers, kernel_size, n_blocks)
130
+
131
+ def forward(self, x):
132
+ x, concat_tensors = self.encoder(x)
133
+ return self.decoder(self.intermediate(x), concat_tensors)
134
+
135
+ class E2E(nn.Module):
136
+ def __init__(self, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16):
137
+ super(E2E, self).__init__()
138
+ self.unet = DeepUnet(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels)
139
+ self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
140
+ self.fc = nn.Sequential(BiGRU(3 * 128, 256, n_gru), nn.Linear(512, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()) if n_gru else nn.Sequential(nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid())
141
+
142
+ def forward(self, mel):
143
+ return self.fc(self.cnn(self.unet(mel.transpose(-1, -2).unsqueeze(1))).transpose(1, 2).flatten(-2))
144
+
145
+ class MelSpectrogram(torch.nn.Module):
146
+ def __init__(self, is_half, n_mel_channels, sample_rate, win_length, hop_length, n_fft=None, mel_fmin=0, mel_fmax=None, clamp=1e-5):
147
+ super().__init__()
148
+ n_fft = win_length if n_fft is None else n_fft
149
+ self.hann_window = {}
150
+ mel_basis = mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax, htk=True)
151
+ mel_basis = torch.from_numpy(mel_basis).float()
152
+ self.register_buffer("mel_basis", mel_basis)
153
+ self.n_fft = win_length if n_fft is None else n_fft
154
+ self.hop_length = hop_length
155
+ self.win_length = win_length
156
+ self.sample_rate = sample_rate
157
+ self.n_mel_channels = n_mel_channels
158
+ self.clamp = clamp
159
+ self.is_half = is_half
160
+
161
+ def forward(self, audio, keyshift=0, speed=1, center=True):
162
+ factor = 2 ** (keyshift / 12)
163
+ win_length_new = int(np.round(self.win_length * factor))
164
+ keyshift_key = str(keyshift) + "_" + str(audio.device)
165
+ if keyshift_key not in self.hann_window: self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(audio.device)
166
+
167
+ n_fft = int(np.round(self.n_fft * factor))
168
+ hop_length = int(np.round(self.hop_length * speed))
169
+
170
+ if str(audio.device).startswith("ocl"):
171
+ stft = opencl.STFT(filter_length=n_fft, hop_length=hop_length, win_length=win_length_new).to(audio.device)
172
+ magnitude = stft.transform(audio, 1e-9)
173
+ else:
174
+ fft = torch.stft(audio, n_fft=n_fft, hop_length=hop_length, win_length=win_length_new, window=self.hann_window[keyshift_key], center=center, return_complex=True)
175
+ magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
176
+
177
+ if keyshift != 0:
178
+ size = self.n_fft // 2 + 1
179
+ resize = magnitude.size(1)
180
+ if resize < size: magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
181
+ magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
182
+
183
+ mel_output = torch.matmul(self.mel_basis, magnitude)
184
+ if self.is_half: mel_output = mel_output.half()
185
+
186
+ return torch.log(torch.clamp(mel_output, min=self.clamp))
187
+
188
+ class RMVPE:
189
+ def __init__(self, model_path, is_half, device=None):
190
+ self.resample_kernel = {}
191
+ self.resample_kernel = {}
192
+ model = E2E(4, 1, (2, 2))
193
+ ckpt = torch.load(model_path, map_location="cpu")
194
+ model.load_state_dict(ckpt)
195
+ model.eval()
196
+ if is_half: model = model.half()
197
+ self.model = model.to(device)
198
+ self.is_half = is_half
199
+ self.device = device
200
+ self.mel_extractor = MelSpectrogram(is_half, N_MELS, 16000, 1024, 160, None, 30, 8000).to(device)
201
+ cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191
202
+ self.cents_mapping = np.pad(cents_mapping, (4, 4))
203
+
204
+ def mel2hidden(self, mel):
205
+ with torch.no_grad():
206
+ n_frames = mel.shape[-1]
207
+ n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
208
+ if n_pad > 0: mel = F.pad(mel, (0, n_pad), mode="constant")
209
+
210
+ hidden = self.model(mel.half() if self.is_half else mel.float())
211
+ return hidden[:, :n_frames]
212
+
213
+ def decode(self, hidden, thred=0.03):
214
+ f0 = 10 * (2 ** (self.to_local_average_cents(hidden, thred=thred) / 1200))
215
+ f0[f0 == 10] = 0
216
+
217
+ return f0
218
+
219
+ def infer_from_audio(self, audio, thred=0.03):
220
+ hidden = self.mel2hidden(self.mel_extractor(torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True))
221
+
222
+ return self.decode((hidden.squeeze(0).cpu().numpy().astype(np.float32) if self.is_half else hidden.squeeze(0).cpu().numpy()), thred=thred)
223
+
224
+ def infer_from_audio_with_pitch(self, audio, thred=0.03, f0_min=50, f0_max=1100):
225
+ hidden = self.mel2hidden(self.mel_extractor(torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True))
226
+
227
+ f0 = self.decode((hidden.squeeze(0).cpu().numpy().astype(np.float32) if self.is_half else hidden.squeeze(0).cpu().numpy()), thred=thred)
228
+ f0[(f0 < f0_min) | (f0 > f0_max)] = 0
229
+
230
+ return f0
231
+
232
+ def to_local_average_cents(self, salience, thred=0.05):
233
+ center = np.argmax(salience, axis=1)
234
+ salience = np.pad(salience, ((0, 0), (4, 4)))
235
+ center += 4
236
+ todo_salience, todo_cents_mapping = [], []
237
+ starts = center - 4
238
+ ends = center + 5
239
+
240
+ for idx in range(salience.shape[0]):
241
+ todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
242
+ todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
243
+
244
+ todo_salience = np.array(todo_salience)
245
+ devided = np.sum(todo_salience * np.array(todo_cents_mapping), 1) / np.sum(todo_salience, 1)
246
+ devided[np.max(salience, axis=1) <= thred] = 0
247
+
248
+ return devided
249
+
250
+ class BiGRU(nn.Module):
251
+ def __init__(self, input_features, hidden_features, num_layers):
252
+ super(BiGRU, self).__init__()
253
+ self.gru = nn.GRU(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True)
254
+
255
+ def forward(self, x):
256
+ try:
257
+ return self.gru(x)[0]
258
+ except:
259
+ torch.backends.cudnn.enabled = False
260
+ return self.gru(x)[0]
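Hypothetical usage of the RMVPE wrapper; the checkpoint path is a placeholder for wherever the weights actually live. The model expects 16 kHz mono audio as a float NumPy array and returns one f0 value per 10 ms hop.

```python
import numpy as np
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
rmvpe = RMVPE("models/rmvpe.pt", is_half=False, device=device)   # placeholder path

audio = np.random.randn(16000).astype(np.float32)                # one second at 16 kHz
f0 = rmvpe.infer_from_audio_with_pitch(audio, thred=0.03, f0_min=50, f0_max=1100)
print(f0.shape)                                                  # roughly 100 frames
```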
RVC/modules/swipe.py ADDED
@@ -0,0 +1,200 @@
1
+ import math
2
+
3
+ import numba as nb
4
+ import numpy as np
5
+
6
+ from matplotlib import mlab
7
+ from scipy import interpolate
8
+ from decimal import Decimal, ROUND_HALF_UP
9
+
10
+
11
+ def swipe(x, fs, f0_floor=50, f0_ceil=1100, frame_period=10, sTHR=0.3):
12
+ plim = np.array([f0_floor, f0_ceil])
13
+ t = np.arange(0, int(1000 * len(x) / fs / (frame_period) + 1)) * (frame_period / 1000)
14
+
15
+ log2pc = np.arange(np.log2(plim[0]) * 96, np.log2(plim[-1]) * 96)
16
+ log2pc *= (1 / 96)
17
+
18
+ pc = 2 ** log2pc
19
+ S = np.zeros((len(pc), len(t)))
20
+
21
+ logWs = [round_matlab(elm) for elm in np.log2(4 * 2 * fs / plim)]
22
+ ws = 2 ** np.arange(logWs[0], logWs[1] - 1, -1)
23
+ p0 = 4 * 2 * fs / ws
24
+
25
+ d = 1 + log2pc - np.log2(4 * 2 * fs / ws[0])
26
+ fERBs = erbs2hz(np.arange(hz2erbs(pc[0] / 4), hz2erbs(fs / 2), 0.1))
27
+
28
+ for i in range(len(ws)):
29
+ dn = round_matlab(4 * fs / p0[i])
30
+ X, f, ti = mlab.specgram(x=np.r_[np.zeros(int(ws[i] / 2)), np.r_[x, np.zeros(int(dn + ws[i] / 2))]], NFFT=ws[i], Fs=fs, window=np.hanning(ws[i] + 2)[1:-1], noverlap=max(0, np.round(ws[i] - dn)), mode='complex')
31
+ ti = np.r_[0, ti[:-1]]
32
+ M = np.maximum(0, interpolate.interp1d(f, np.abs(X.T), kind='cubic')(fERBs)).T
33
+
34
+ if i == len(ws) - 1:
35
+ j = np.where(d - (i + 1) > -1)[0]
36
+ k = np.where(d[j] - (i + 1) < 0)[0]
37
+ elif i == 0:
38
+ j = np.where(d - (i + 1) < 1)[0]
39
+ k = np.where(d[j] - (i + 1) > 0)[0]
40
+ else:
41
+ j = np.where(np.abs(d - (i + 1)) < 1)[0]
42
+ k = np.arange(len(j))
43
+
44
+ Si = pitchStrengthAllCandidates(fERBs, np.sqrt(M), pc[j])
45
+ Si = interpolate.interp1d(ti, Si, bounds_error=False, fill_value='nan')(t) if Si.shape[1] > 1 else np.full((len(Si), len(t)), np.nan)
46
+
47
+ mu = np.ones(j.shape)
48
+ mu[k] = 1 - np.abs(d[j[k]] - i - 1)
49
+ S[j, :] = S[j, :] + np.tile(mu.reshape(-1, 1), (1, Si.shape[1])) * Si
50
+
51
+
52
+ p = np.full((S.shape[1], 1), np.nan)
53
+ s = np.full((S.shape[1], 1), np.nan)
54
+
55
+ for j in range(S.shape[1]):
56
+ s[j] = np.max(S[:, j])
57
+ i = np.argmax(S[:, j])
58
+
59
+ if s[j] < sTHR: continue
60
+
61
+ if i == 0: p[j] = pc[0]
62
+ elif i == len(pc) - 1: p[j] = pc[-1]
63
+ else:
64
+ I = np.arange(i-1, i+2)
65
+ tc = 1 / pc[I]
66
+
67
+ ntc = (tc / tc[1] - 1) * 2 * np.pi
68
+ idx = np.isfinite(S[I, j])
69
+
70
+ c = np.zeros(len(ntc))
71
+ c += np.nan
72
+
73
+ I_ = I[idx]
74
+
75
+ if len(I_) < 2: c[idx] = (S[I, j])[0] / ntc[0]
76
+ else: c[idx] = np.polyfit(ntc[idx], (S[I_, j]), 2)
77
+
78
+ pval = np.polyval(c, ((1 / (2 ** np.arange(np.log2(pc[I[0]]), np.log2(pc[I[2]]) + 1 / 12 / 64, 1 / 12 / 64))) / tc[1] - 1) * 2 * np.pi)
79
+ s[j] = np.max(pval)
80
+ p[j] = 2 ** (np.log2(pc[I[0]]) + (np.argmax(pval)) / 12 / 64)
81
+
82
+ p = p.flatten()
83
+ p[np.isnan(p)] = 0
84
+
85
+ return np.array(p, dtype=np.float32), np.array(t, dtype=np.float32)
86
+
87
+ def round_matlab(n):
88
+ return int(Decimal(n).quantize(0, ROUND_HALF_UP))
89
+
90
+ def pitchStrengthAllCandidates(f, L, pc):
91
+ den = np.sqrt(np.sum(L * L, axis=0))
92
+ den = np.where(den == 0, 2.220446049250313e-16, den)
93
+
94
+ L = L / den
95
+ S = np.zeros((len(pc), L.shape[1]))
96
+
97
+ for j in range(len(pc)):
98
+ S[j,:] = pitchStrengthOneCandidate(f, L, pc[j])
99
+
100
+ return S
101
+
102
+ def pitchStrengthOneCandidate(f, L, pc):
103
+ k = np.zeros(len(f))
104
+ q = f / pc
105
+
106
+ for i in ([1] + sieve(int(np.fix(f[-1] / pc - 0.75)))):
107
+ a = np.abs(q - i)
108
+ p = a < 0.25
109
+ k[p] = np.cos(2 * np.pi * q[p])
110
+
111
+ v = np.logical_and((0.25 < a), (a < 0.75))
112
+ k[v] = k[v] + np.cos(2 * np.pi * q[v]) / 2
113
+
114
+ k *= np.sqrt(1 / f)
115
+ k /= np.linalg.norm(k[k>0])
116
+
117
+ return k @ L
118
+
119
+ def hz2erbs(hz):
120
+ return 21.4 * np.log10(1 + hz / 229)
121
+
122
+ def erbs2hz(erbs):
123
+ return (10 ** (erbs / 21.4) - 1) * 229
124
+
125
+ def sieve(n):
126
+ primes = list(range(2, n + 1))
127
+ num = 2
128
+
129
+ while num < math.sqrt(n):
130
+ i = num
131
+
132
+ while i <= n:
133
+ i += num
134
+
135
+ if i in primes: primes.remove(i)
136
+
137
+ for j in primes:
138
+ if j > num:
139
+ num = j
140
+ break
141
+
142
+ return primes
143
+
144
+ def stonemask(x, fs, temporal_positions, f0):
145
+ refined_f0 = np.copy(f0)
146
+
147
+ for i in range(len(temporal_positions)):
148
+ if f0[i] != 0:
149
+ refined_f0[i] = get_refined_f0(x, fs, temporal_positions[i], f0[i])
150
+ if abs(refined_f0[i] - f0[i]) / f0[i] > 0.2: refined_f0[i] = f0[i]
151
+
152
+ return np.array(refined_f0, dtype=np.float32)
153
+
154
+ def get_refined_f0(x, fs, current_time, current_f0):
155
+ f0_initial = current_f0
156
+ half_window_length = np.ceil(3 * fs / f0_initial / 2)
157
+ window_length_in_time = (2 * half_window_length + 1) / fs
158
+
159
+ base_time = np.arange(-half_window_length, half_window_length + 1) / fs
160
+ fft_size = 2 ** math.ceil(math.log((half_window_length * 2 + 1), 2) + 1)
161
+
162
+ base_time = np.array([float("{0:.4f}".format(elm)) for elm in base_time])
163
+ index_raw = round_matlab_2((current_time + base_time) * fs)
164
+
165
+ window_time = ((index_raw - 1) / fs) - current_time
166
+ main_window = 0.42 + 0.5 * np.cos(2 * math.pi * window_time / window_length_in_time) + 0.08 * np.cos(4 * math.pi * window_time / window_length_in_time)
167
+
168
+ index = np.array(np.maximum(1, np.minimum(len(x), index_raw)), dtype=int)
169
+ spectrum = np.fft.fft(x[index - 1] * main_window, fft_size)
170
+
171
+ diff_spectrum = np.fft.fft(x[index - 1] * (-(np.diff(np.r_[0, main_window]) + np.diff(np.r_[main_window, 0])) / 2), fft_size)
172
+ power_spectrum = np.abs(spectrum) ** 2
173
+
174
+ from sys import float_info
175
+
176
+ power_spectrum[power_spectrum == 0] = float_info.epsilon
177
+ instantaneous_frequency = (np.arange(fft_size) / fft_size * fs) + (np.real(spectrum) * np.imag(diff_spectrum) - np.imag(spectrum) * np.real(diff_spectrum)) / power_spectrum * fs / 2 / math.pi
178
+
179
+ trim_index = np.array([1, 2])
180
+ index_list_trim = np.array(round_matlab_2(f0_initial * fft_size / fs * trim_index) + 1, int)
181
+
182
+ amp_list = np.sqrt(power_spectrum[index_list_trim - 1])
183
+ f0_initial = np.sum(amp_list * instantaneous_frequency[index_list_trim - 1]) / np.sum(amp_list * trim_index)
184
+
185
+ if f0_initial < 0: return 0
186
+
187
+ trim_index = np.array([1, 2, 3, 4, 5, 6])
188
+ index_list_trim = np.array(round_matlab_2(f0_initial * fft_size / fs * trim_index) + 1, int)
189
+ amp_list = np.sqrt(power_spectrum[index_list_trim - 1])
190
+
191
+ return np.sum(amp_list * instantaneous_frequency[index_list_trim - 1]) / np.sum(amp_list * trim_index)
192
+
193
+ @nb.jit((nb.float64[:],), nopython=True, cache=True)
194
+ def round_matlab_2(x):
195
+ y = x.copy()
196
+
197
+ y[x > 0] += 0.5
198
+ y[x <= 0] -= 0.5
199
+
200
+ return y
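Usage sketch for the SWIPE estimator on a synthetic tone; `frame_period` is in milliseconds, so a 10 ms hop yields roughly 100 estimates per second, and the `stonemask` pass defined above can optionally refine them.

```python
import numpy as np

sr = 16000
t = np.arange(sr) / sr
x = 0.5 * np.sin(2 * np.pi * 220.0 * t)

f0, times = swipe(x, sr, f0_floor=50, f0_ceil=1100, frame_period=10)
f0 = stonemask(x, sr, times, f0)               # optional refinement step
voiced = f0[f0 > 0]
print(len(f0), float(np.median(voiced)) if len(voiced) else 0.0)   # median near 220 Hz
```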
RVC/modules/synthesizers.py ADDED
@@ -0,0 +1,84 @@
1
+ import os
2
+ import sys
3
+ import torch
4
+
5
+ sys.path.append(os.getcwd())
6
+
7
+ from modules.hifigan import HiFiGANGenerator
8
+ from modules.refinegan import RefineGANGenerator
9
+ from modules.residuals import ResidualCouplingBlock
10
+ from modules.mrf_hifigan import HiFiGANMRFGenerator
11
+ from modules.nsf_hifigan import HiFiGANNRFGenerator
12
+ from modules.encoders import TextEncoder, PosteriorEncoder
13
+ from modules.commons import slice_segments, rand_slice_segments
14
+
15
+ class Synthesizer(torch.nn.Module):
16
+ def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, sr, use_f0, text_enc_hidden_dim=768, vocoder="Default", checkpointing=False, energy=False, **kwargs):
17
+ super(Synthesizer, self).__init__()
18
+ self.spec_channels = spec_channels
19
+ self.inter_channels = inter_channels
20
+ self.hidden_channels = hidden_channels
21
+ self.filter_channels = filter_channels
22
+ self.n_heads = n_heads
23
+ self.n_layers = n_layers
24
+ self.kernel_size = kernel_size
25
+ self.p_dropout = float(p_dropout)
26
+ self.resblock_kernel_sizes = resblock_kernel_sizes
27
+ self.resblock_dilation_sizes = resblock_dilation_sizes
28
+ self.upsample_rates = upsample_rates
29
+ self.upsample_initial_channel = upsample_initial_channel
30
+ self.upsample_kernel_sizes = upsample_kernel_sizes
31
+ self.segment_size = segment_size
32
+ self.gin_channels = gin_channels
33
+ self.spk_embed_dim = spk_embed_dim
34
+ self.use_f0 = use_f0
35
+ self.enc_p = TextEncoder(inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, float(p_dropout), text_enc_hidden_dim, f0=use_f0, energy=energy)
36
+
37
+ if use_f0:
38
+ if vocoder == "RefineGAN": self.dec = RefineGANGenerator(sample_rate=sr, upsample_rates=upsample_rates, num_mels=inter_channels, checkpointing=checkpointing)
39
+ elif vocoder in ["MRF-HiFi-GAN", "MRF HiFi-GAN"]: self.dec = HiFiGANMRFGenerator(in_channel=inter_channels, upsample_initial_channel=upsample_initial_channel, upsample_rates=upsample_rates, upsample_kernel_sizes=upsample_kernel_sizes, resblock_kernel_sizes=resblock_kernel_sizes, resblock_dilations=resblock_dilation_sizes, gin_channels=gin_channels, sample_rate=sr, harmonic_num=8, checkpointing=checkpointing)
40
+ else: self.dec = HiFiGANNRFGenerator(inter_channels, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels, sr=sr, checkpointing=checkpointing)
41
+ else: self.dec = HiFiGANGenerator(inter_channels, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
42
+
43
+ self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
44
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels)
45
+ self.emb_g = torch.nn.Embedding(self.spk_embed_dim, gin_channels)
46
+
47
+ def remove_weight_norm(self):
48
+ self.dec.remove_weight_norm()
49
+ self.flow.remove_weight_norm()
50
+ self.enc_q.remove_weight_norm()
51
+
52
+ @torch.jit.ignore
53
+ def forward(self, phone, phone_lengths, pitch = None, pitchf = None, y = None, y_lengths = None, ds = None, energy = None):
54
+ g = self.emb_g(ds).unsqueeze(-1)
55
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths, energy)
56
+
57
+ if y is not None:
58
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
59
+ z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size)
60
+
61
+ return (self.dec(z_slice, slice_segments(pitchf, ids_slice, self.segment_size, 2), g=g) if self.use_f0 else self.dec(z_slice, g=g)), ids_slice, x_mask, y_mask, (z, self.flow(z, y_mask, g=g), m_p, logs_p, m_q, logs_q)
62
+ else: return None, None, x_mask, None, (None, None, m_p, logs_p, None, None)
63
+
64
+ @torch.jit.export
65
+ def infer(self, phone, phone_lengths, pitch = None, nsff0 = None, sid = None, energy = None, rate = None):
66
+ g = self.emb_g(sid).unsqueeze(-1)
67
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths, energy)
68
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
69
+
70
+ if rate is not None:
71
+ assert isinstance(rate, torch.Tensor)
72
+ head = int(z_p.shape[2] * (1.0 - rate.item()))
73
+ z_p = z_p[:, :, head:]
74
+ x_mask = x_mask[:, :, head:]
75
+ if self.use_f0: nsff0 = nsff0[:, head:]
76
+
77
+ if self.use_f0:
78
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
79
+ o = self.dec(z * x_mask, nsff0, g=g)
80
+ else:
81
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
82
+ o = self.dec(z * x_mask, g=g)
83
+
84
+ return o, x_mask, (z, z_p, m_p, logs_p)
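A shape sketch for `Synthesizer.infer`, assuming `net_g` is an already-constructed and loaded checkpoint with `use_f0=True` and a 768-dimensional content encoder (v2-style). The tensors below are random stand-ins for the HuBERT features, coarse pitch bins and f0 curve that the pipeline above would normally pass in.

```python
import torch

frames = 200
phone = torch.randn(1, frames, 768)                 # content features
phone_lengths = torch.tensor([frames]).long()
pitch = torch.randint(1, 255, (1, frames)).long()   # coarse pitch bins
nsff0 = torch.rand(1, frames) * 300 + 100           # f0 in Hz for the NSF decoder
sid = torch.tensor([0]).long()                      # speaker id

with torch.no_grad():
    audio, x_mask, _ = net_g.infer(phone, phone_lengths, pitch, nsff0, sid)
print(audio.shape)                                  # (1, 1, frames * hop)
```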
RVC/modules/torchcrepe.py ADDED
@@ -0,0 +1,185 @@
1
+ import torch
2
+ import librosa
3
+ import functools
4
+ import scipy.stats
5
+
6
+ import numpy as np
7
+
8
+ CENTS_PER_BIN, MAX_FMAX, PITCH_BINS, SAMPLE_RATE, WINDOW_SIZE = 20, 2006, 360, 16000, 1024
9
+
10
+ def mean(signals, win_length=9):
11
+ assert signals.dim() == 2
12
+
13
+ signals = signals.unsqueeze(1)
14
+ mask = ~torch.isnan(signals)
15
+ padding = win_length // 2
16
+
17
+ ones_kernel = torch.ones(signals.size(1), 1, win_length, device=signals.device)
18
+ avg_pooled = torch.nn.functional.conv1d(torch.where(mask, signals, torch.zeros_like(signals)), ones_kernel, stride=1, padding=padding) / torch.nn.functional.conv1d(mask.float(), ones_kernel, stride=1, padding=padding).clamp(min=1)
19
+ avg_pooled[avg_pooled == 0] = float("nan")
20
+
21
+ return avg_pooled.squeeze(1)
22
+
23
+ def median(signals, win_length):
24
+ assert signals.dim() == 2
25
+
26
+ signals = signals.unsqueeze(1)
27
+ mask = ~torch.isnan(signals)
28
+ padding = win_length // 2
29
+
30
+ x = torch.nn.functional.pad(torch.where(mask, signals, torch.zeros_like(signals)), (padding, padding), mode="reflect")
31
+ mask = torch.nn.functional.pad(mask.float(), (padding, padding), mode="constant", value=0)
32
+
33
+ x = x.unfold(2, win_length, 1)
34
+ mask = mask.unfold(2, win_length, 1)
35
+
36
+ x = x.contiguous().view(x.size()[:3] + (-1,))
37
+ mask = mask.contiguous().view(mask.size()[:3] + (-1,))
38
+
39
+ x_sorted, _ = torch.sort(torch.where(mask.bool(), x.float(), float("inf")).to(x), dim=-1)
40
+
41
+ median_pooled = x_sorted.gather(-1, ((mask.sum(dim=-1) - 1) // 2).clamp(min=0).unsqueeze(-1).long()).squeeze(-1)
42
+ median_pooled[torch.isinf(median_pooled)] = float("nan")
43
+
44
+ return median_pooled.squeeze(1)
45
+
46
+ class CREPE_MODEL(torch.nn.Module):
47
+ def __init__(self, model='full'):
48
+ super().__init__()
49
+ in_channels = {"full": [1, 1024, 128, 128, 128, 256], "large": [1, 768, 96, 96, 96, 192], "medium": [1, 512, 64, 64, 64, 128], "small": [1, 256, 32, 32, 32, 64], "tiny": [1, 128, 16, 16, 16, 32]}[model]
50
+ out_channels = {"full": [1024, 128, 128, 128, 256, 512], "large": [768, 96, 96, 96, 192, 384], "medium": [512, 64, 64, 64, 128, 256], "small": [256, 32, 32, 32, 64, 128], "tiny": [128, 16, 16, 16, 32, 64]}[model]
51
+ self.in_features = {"full": 2048, "large": 1536, "medium": 1024, "small": 512, "tiny": 256}[model]
52
+
53
+ kernel_sizes = [(512, 1)] + 5 * [(64, 1)]
54
+ strides = [(4, 1)] + 5 * [(1, 1)]
55
+ batch_norm_fn = functools.partial(torch.nn.BatchNorm2d, eps=0.0010000000474974513, momentum=0.0)
56
+
57
+ self.conv1 = torch.nn.Conv2d(in_channels=in_channels[0], out_channels=out_channels[0], kernel_size=kernel_sizes[0], stride=strides[0])
58
+ self.conv1_BN = batch_norm_fn(num_features=out_channels[0])
59
+
60
+ self.conv2 = torch.nn.Conv2d(in_channels=in_channels[1], out_channels=out_channels[1], kernel_size=kernel_sizes[1], stride=strides[1])
61
+ self.conv2_BN = batch_norm_fn(num_features=out_channels[1])
62
+
63
+ self.conv3 = torch.nn.Conv2d(in_channels=in_channels[2], out_channels=out_channels[2], kernel_size=kernel_sizes[2], stride=strides[2])
64
+ self.conv3_BN = batch_norm_fn(num_features=out_channels[2])
65
+
66
+ self.conv4 = torch.nn.Conv2d(in_channels=in_channels[3], out_channels=out_channels[3], kernel_size=kernel_sizes[3], stride=strides[3])
67
+ self.conv4_BN = batch_norm_fn(num_features=out_channels[3])
68
+
69
+ self.conv5 = torch.nn.Conv2d(in_channels=in_channels[4], out_channels=out_channels[4], kernel_size=kernel_sizes[4], stride=strides[4])
70
+ self.conv5_BN = batch_norm_fn(num_features=out_channels[4])
71
+
72
+ self.conv6 = torch.nn.Conv2d(in_channels=in_channels[5], out_channels=out_channels[5], kernel_size=kernel_sizes[5], stride=strides[5])
73
+ self.conv6_BN = batch_norm_fn(num_features=out_channels[5])
74
+
75
+ self.classifier = torch.nn.Linear(in_features=self.in_features, out_features=PITCH_BINS)
76
+
77
+ def forward(self, x, embed=False):
78
+ x = self.embed(x)
79
+ if embed: return x
80
+ return torch.sigmoid(self.classifier(self.layer(x, self.conv6, self.conv6_BN).permute(0, 2, 1, 3).reshape(-1, self.in_features)))
81
+
82
+ def embed(self, x):
83
+ x = x[:, None, :, None]
84
+ return self.layer(self.layer(self.layer(self.layer(self.layer(x, self.conv1, self.conv1_BN, (0, 0, 254, 254)), self.conv2, self.conv2_BN), self.conv3, self.conv3_BN), self.conv4, self.conv4_BN), self.conv5, self.conv5_BN)
85
+
86
+ def layer(self, x, conv, batch_norm, padding=(0, 0, 31, 32)):
87
+ return torch.nn.functional.max_pool2d(batch_norm(torch.nn.functional.relu(conv(torch.nn.functional.pad(x, padding)))), (2, 1), (2, 1))
88
+
89
+ class CREPE:
90
+ def __init__(self, model_path, model_size="full", hop_length=512, batch_size=None, f0_min=50, f0_max=1100, device=None, sample_rate=16000, return_periodicity=False):
91
+ self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
92
+ self.hop_length = hop_length
93
+ self.batch_size = batch_size
94
+ self.sample_rate = sample_rate
95
+ self.f0_min = f0_min
96
+ self.f0_max = f0_max
97
+ self.return_periodicity = return_periodicity
98
+ model = CREPE_MODEL(model_size)
99
+ ckpt = torch.load(model_path, map_location="cpu")
100
+ model.load_state_dict(ckpt)
101
+ model.eval()
102
+ self.model = model.to(device)
103
+
104
+ def bins_to_frequency(self, bins):
105
+ if str(bins.device).startswith("ocl"): bins = bins.to(torch.float32)
106
+
107
+ cents = CENTS_PER_BIN * bins + 1997.3794084376191
108
+ return 10 * 2 ** ((cents + cents.new_tensor(scipy.stats.triang.rvs(c=0.5, loc=-CENTS_PER_BIN, scale=2 * CENTS_PER_BIN, size=cents.size()))) / 1200)
109
+
110
+ def frequency_to_bins(self, frequency, quantize_fn=torch.floor):
111
+ return quantize_fn(((1200 * torch.log2(frequency / 10)) - 1997.3794084376191) / CENTS_PER_BIN).int()
112
+
113
+ def viterbi(self, logits):
114
+ if not hasattr(self, 'transition'):
115
+ xx, yy = np.meshgrid(range(360), range(360))
116
+ transition = np.maximum(12 - abs(xx - yy), 0)
117
+ self.transition = transition / transition.sum(axis=1, keepdims=True)
118
+
119
+ with torch.no_grad():
120
+ probs = torch.nn.functional.softmax(logits, dim=1)
121
+
122
+ bins = torch.tensor(np.array([librosa.sequence.viterbi(sequence, self.transition).astype(np.int64) for sequence in probs.cpu().numpy()]), device=probs.device)
123
+ return bins, self.bins_to_frequency(bins)
124
+
125
+ def preprocess(self, audio, pad=True):
126
+ hop_length = (self.sample_rate // 100) if self.hop_length is None else self.hop_length
127
+
128
+ if self.sample_rate != SAMPLE_RATE:
129
+ audio = torch.tensor(librosa.resample(audio.detach().cpu().numpy().squeeze(0), orig_sr=self.sample_rate, target_sr=SAMPLE_RATE, res_type="soxr_vhq"), device=audio.device).unsqueeze(0)
130
+ hop_length = int(hop_length * SAMPLE_RATE / self.sample_rate)
131
+
132
+ if pad:
133
+ total_frames = 1 + int(audio.size(1) // hop_length)
134
+ audio = torch.nn.functional.pad(audio, (WINDOW_SIZE // 2, WINDOW_SIZE // 2))
135
+ else: total_frames = 1 + int((audio.size(1) - WINDOW_SIZE) // hop_length)
136
+
137
+ batch_size = total_frames if self.batch_size is None else self.batch_size
138
+
139
+ for i in range(0, total_frames, batch_size):
140
+ frames = torch.nn.functional.unfold(audio[:, None, None, max(0, i * hop_length):min(audio.size(1), (i + batch_size - 1) * hop_length + WINDOW_SIZE)], kernel_size=(1, WINDOW_SIZE), stride=(1, hop_length))
141
+
142
+ if self.device.startswith("ocl"):
143
+ frames = frames.transpose(1, 2).contiguous().reshape(-1, WINDOW_SIZE).to(self.device)
144
+ else:
145
+ frames = frames.transpose(1, 2).reshape(-1, WINDOW_SIZE).to(self.device)
146
+
147
+ frames -= frames.mean(dim=1, keepdim=True)
148
+ frames /= torch.max(torch.tensor(1e-10, device=frames.device), frames.std(dim=1, keepdim=True))
149
+
150
+ yield frames
151
+
152
+ def periodicity(self, probabilities, bins):
153
+ probs_stacked = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS)
154
+ periodicity = probs_stacked.gather(1, bins.reshape(-1, 1).to(torch.int64))
155
+
156
+ return periodicity.reshape(probabilities.size(0), probabilities.size(2))
157
+
158
+ def postprocess(self, probabilities):
159
+ probabilities = probabilities.detach()
160
+ probabilities[:, :self.frequency_to_bins(torch.tensor(self.f0_min))] = -float('inf')
161
+ probabilities[:, self.frequency_to_bins(torch.tensor(self.f0_max), torch.ceil):] = -float('inf')
162
+
163
+ bins, pitch = self.viterbi(probabilities)
164
+
165
+ if not self.return_periodicity: return pitch
166
+ return pitch, self.periodicity(probabilities, bins)
167
+
168
+ def compute_f0(self, audio, pad=True):
169
+ results = []
170
+
171
+ for frames in self.preprocess(audio, pad):
172
+ with torch.no_grad():
173
+ model = self.model(
174
+ frames,
175
+ embed=False
176
+ ).reshape(audio.size(0), -1, PITCH_BINS).transpose(1, 2)
177
+
178
+ result = self.postprocess(model)
179
+ results.append((result[0].to(audio.device), result[1].to(audio.device)) if isinstance(result, tuple) else result.to(audio.device))
180
+
181
+ if self.return_periodicity:
182
+ pitch, periodicity = zip(*results)
183
+ return torch.cat(pitch, 1), torch.cat(periodicity, 1)
184
+
185
+ return torch.cat(results, 1)
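Hypothetical usage of the CREPE wrapper; the checkpoint path is a placeholder. `compute_f0` takes a `(1, samples)` float tensor at `sample_rate`, and with `return_periodicity=True` it also returns a per-frame confidence.

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
crepe = CREPE("models/crepe_full.pth", model_size="full", hop_length=160, f0_min=50,
              f0_max=1100, device=device, sample_rate=16000, return_periodicity=True)

audio = torch.randn(1, 16000)                       # one second at 16 kHz
pitch, periodicity = crepe.compute_f0(audio.to(device), pad=True)
print(pitch.shape, periodicity.shape)               # (1, frames), (1, frames)
```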
RVC/modules/torchfcpe.py ADDED
@@ -0,0 +1,951 @@
1
+ import os
2
+ import sys
3
+ import math
4
+ import torch
5
+
6
+ import numpy as np
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from torch import einsum
11
+ from functools import partial
12
+ from librosa.filters import mel
13
+ from torchaudio.transforms import Resample
14
+ from einops import rearrange, repeat, pack, unpack
15
+ from torch.nn.utils.parametrizations import weight_norm
16
+
17
+ sys.path.append(os.getcwd())
18
+
19
+ from modules import opencl
20
+
21
+ os.environ["LRU_CACHE_CAPACITY"] = "3"
22
+
23
+ def spawn_wav2mel(args, device = None):
24
+ _type = args.mel.type
25
+ if (str(_type).lower() == 'none') or (str(_type).lower() == 'default'): _type = 'default'
26
+ elif str(_type).lower() == 'stft': _type = 'stft'
27
+ wav2mel = Wav2MelModule(sr=args.mel.sr, n_mels=args.mel.num_mels, n_fft=args.mel.n_fft, win_size=args.mel.win_size, hop_length=args.mel.hop_size, fmin=args.mel.fmin, fmax=args.mel.fmax, clip_val=1e-05, mel_type=_type)
28
+
29
+ return wav2mel.to(torch.device(device))
30
+
31
+ def calc_same_padding(kernel_size):
32
+ pad = kernel_size // 2
33
+ return (pad, pad - (kernel_size + 1) % 2)
34
+
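A small check of calc_same_padding, derived directly from the definition above: odd kernels get a symmetric pad, even kernels an asymmetric one.

# calc_same_padding(k) == (k // 2, k // 2 - (k + 1) % 2)
assert calc_same_padding(31) == (15, 15)  # odd kernel: symmetric 'same' padding
assert calc_same_padding(4) == (2, 1)     # even kernel: one extra sample on the left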
35
+ def l2_regularization(model, l2_alpha):
36
+ l2_loss = []
37
+ for module in model.modules():
38
+ if type(module) is nn.Conv2d: l2_loss.append((module.weight**2).sum() / 2.0)
39
+
40
+ return l2_alpha * sum(l2_loss)
41
+
42
+ def torch_interp(x, xp, fp):
43
+ sort_idx = torch.argsort(xp)
44
+ xp = xp[sort_idx]
45
+ fp = fp[sort_idx]
46
+
47
+ right_idxs = torch.searchsorted(xp, x).clamp(max=len(xp) - 1)
48
+ left_idxs = (right_idxs - 1).clamp(min=0)
49
+ x_left = xp[left_idxs]
50
+ y_left = fp[left_idxs]
51
+
52
+ interp_vals = y_left + ((x - x_left) * (fp[right_idxs] - y_left) / (xp[right_idxs] - x_left))
53
+ interp_vals[x < xp[0]] = fp[0]
54
+ interp_vals[x > xp[-1]] = fp[-1]
55
+
56
+ return interp_vals
57
+
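torch_interp is meant to mirror numpy.interp for monotonic sample points; a minimal sketch of the expected behaviour (tensors invented for illustration):

xp = torch.tensor([0.0, 1.0, 2.0])    # sample positions
fp = torch.tensor([0.0, 10.0, 20.0])  # sample values
x = torch.tensor([-1.0, 0.5, 3.0])    # query points
print(torch_interp(x, xp, fp))        # -> tensor([ 0.,  5., 20.]): clamped at the edges, linear in between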
58
+ def batch_interp_with_replacement_detach(uv, f0):
59
+ result = f0.clone()
60
+ for i in range(uv.shape[0]):
61
+ interp_vals = torch_interp(torch.where(uv[i])[-1], torch.where(~uv[i])[-1], f0[i][~uv[i]]).detach()
62
+ result[i][uv[i]] = interp_vals
63
+
64
+ return result
65
+
66
+ def ensemble_f0(f0s, key_shift_list, tta_uv_penalty):
67
+ device = f0s.device
68
+ f0s = f0s / (torch.pow(2, torch.tensor(key_shift_list, device=device).to(device).unsqueeze(0).unsqueeze(0) / 12))
69
+ notes = torch.log2(f0s / 440) * 12 + 69
70
+ notes[notes < 0] = 0
71
+
72
+ uv_penalty = tta_uv_penalty**2
73
+ dp = torch.zeros_like(notes, device=device)
74
+ backtrack = torch.zeros_like(notes, device=device).long()
75
+ dp[:, 0, :] = (notes[:, 0, :] <= 0) * uv_penalty
76
+
77
+ for t in range(1, notes.size(1)):
78
+ penalty = torch.zeros([notes.size(0), notes.size(2), notes.size(2)], device=device)
79
+ t_uv = notes[:, t, :] <= 0
80
+ penalty += uv_penalty * t_uv.unsqueeze(1)
81
+
82
+ t1_uv = notes[:, t - 1, :] <= 0
83
+ l2 = torch.pow((notes[:, t - 1, :].unsqueeze(-1) - notes[:, t, :].unsqueeze(1)) * (~t1_uv).unsqueeze(-1) * (~t_uv).unsqueeze(1), 2) - 0.5
84
+ l2 = l2 * (l2 > 0)
85
+
86
+ penalty += l2
87
+ penalty += t1_uv.unsqueeze(-1) * (~t_uv).unsqueeze(1) * uv_penalty * 2
88
+
89
+ min_value, min_indices = torch.min(dp[:, t - 1, :].unsqueeze(-1) + penalty, dim=1)
90
+ dp[:, t, :] = min_value
91
+ backtrack[:, t, :] = min_indices
92
+
93
+ t = f0s.size(1) - 1
94
+ f0_result = torch.zeros_like(f0s[:, :, 0], device=device)
95
+ min_indices = torch.argmin(dp[:, t, :], dim=-1)
96
+
97
+ for i in range(0, t + 1):
98
+ f0_result[:, t - i] = f0s[:, t - i, min_indices]
99
+ min_indices = backtrack[:, t - i, min_indices]
100
+
101
+ return f0_result.unsqueeze(-1)
102
+
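ensemble_f0 fuses the key-shifted candidate tracks with a Viterbi-style dynamic program, penalising unvoiced frames and large note jumps. A hedged, shape-level sketch of how it is called (contents invented):

# f0s: (batch, frames, num_key_shifts) candidates in Hz, one column per key shift
f0s = torch.tensor([[[220.0, 440.0, 110.0],
                     [225.0, 450.0, 112.0]]])
fused = ensemble_f0(f0s, key_shift_list=[0, 12, -12], tta_uv_penalty=12.0)
# expected shape: (1, 2, 1), a single fused f0 track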
103
+ def exists(val):
104
+ return val is not None
105
+
106
+ def default(value, d):
107
+ return value if exists(value) else d
108
+
109
+ def empty(tensor):
110
+ return tensor.numel() == 0
111
+
112
+ def pad_to_multiple(tensor, multiple, dim=-1, value=0):
113
+ seqlen = tensor.shape[dim]
114
+ m = seqlen / multiple
115
+ if m.is_integer(): return False, tensor
116
+ return True, F.pad(tensor, (*((0,) * (-1 - dim) * 2), 0, (math.ceil(m) * multiple - seqlen)), value = value)
117
+
118
+ def look_around(x, backward = 1, forward = 0, pad_value = -1, dim = 2):
119
+ t = x.shape[1]
120
+ dims = (len(x.shape) - dim) * (0, 0)
121
+ padded_x = F.pad(x, (*dims, backward, forward), value = pad_value)
122
+ return torch.cat([padded_x[:, ind:(ind + t), ...] for ind in range(forward + backward + 1)], dim = dim)
123
+
124
+ def rotate_half(x):
125
+ x1, x2 = rearrange(x, 'b ... (r d) -> b ... r d', r = 2).unbind(dim = -2)
126
+ return torch.cat((-x2, x1), dim = -1)
127
+
128
+ def apply_rotary_pos_emb(q, k, freqs, scale = 1):
129
+ q_len = q.shape[-2]
130
+ q_freqs = freqs[..., -q_len:, :]
131
+ inv_scale = scale ** -1
132
+ if scale.ndim == 2: scale = scale[-q_len:, :]
133
+ q = (q * q_freqs.cos() * scale) + (rotate_half(q) * q_freqs.sin() * scale)
134
+ k = (k * freqs.cos() * inv_scale) + (rotate_half(k) * freqs.sin() * inv_scale)
135
+
136
+ return q, k
137
+
138
+ def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
139
+ unstructured_block = torch.randn((cols, cols), device=device)
140
+ q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced")
141
+ q, r = map(lambda t: t.to(device), (q, r))
142
+ if qr_uniform_q:
143
+ d = torch.diag(r, 0)
144
+ q *= d.sign()
145
+
146
+ return q.t()
147
+
148
+ def gaussian_orthogonal_random_matrix(nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None):
149
+ nb_full_blocks = int(nb_rows / nb_columns)
150
+ block_list = []
151
+ for _ in range(nb_full_blocks):
152
+ block_list.append(orthogonal_matrix_chunk(nb_columns, qr_uniform_q=qr_uniform_q, device=device))
153
+
154
+ remaining_rows = nb_rows - nb_full_blocks * nb_columns
155
+ if remaining_rows > 0: block_list.append(orthogonal_matrix_chunk(nb_columns, qr_uniform_q=qr_uniform_q, device=device)[:remaining_rows])
156
+ if scaling == 0: multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1)
157
+ elif scaling == 1: multiplier = math.sqrt((float(nb_columns))) * torch.ones((nb_rows,), device=device)
158
+ else: raise ValueError
159
+
160
+ return torch.diag(multiplier) @ torch.cat(block_list)
161
+
162
+ def linear_attention(q, k, v):
163
+ return einsum("...ed,...nd->...ne", k, q) if v is None else einsum("...de,...nd,...n->...ne", einsum("...nd,...ne->...de", k, v), q, 1.0 / (einsum("...nd,...d->...n", q, k.sum(dim=-2).type_as(q)) + 1e-8))
164
+
165
+ def softmax_kernel(data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None):
166
+ b, h, *_ = data.shape
167
+
168
+ data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0
169
+ ratio = projection_matrix.shape[0] ** -0.5
170
+ data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), repeat(projection_matrix, "j d -> b h j d", b=b, h=h).type_as(data))
171
+ diag_data = ((torch.sum(data**2, dim=-1) / 2.0) * (data_normalizer**2)).unsqueeze(dim=-1)
172
+
173
+ return (ratio * (torch.exp(data_dash - diag_data - torch.max(data_dash, dim=-1, keepdim=True).values) + eps) if is_query else ratio * (torch.exp(data_dash - diag_data + eps))).type_as(data)
174
+
175
+ class SinusoidalEmbeddings(nn.Module):
176
+ def __init__(self, dim, scale_base = None, use_xpos = False, theta = 10000):
177
+ super().__init__()
178
+ inv_freq = 1. / (theta ** (torch.arange(0, dim, 2).float() / dim))
179
+ self.register_buffer('inv_freq', inv_freq)
180
+ self.use_xpos = use_xpos
181
+ self.scale_base = scale_base
182
+ assert not (use_xpos and not exists(scale_base))
183
+ scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
184
+ self.register_buffer('scale', scale, persistent = False)
185
+
186
+ def forward(self, x):
187
+ seq_len, device = x.shape[-2], x.device
188
+ t = torch.arange(seq_len, device = x.device).type_as(self.inv_freq)
189
+
190
+ freqs = torch.einsum('i , j -> i j', t, self.inv_freq)
191
+ freqs = torch.cat((freqs, freqs), dim = -1)
192
+
193
+ if not self.use_xpos: return freqs, torch.ones(1, device = device)
194
+
195
+ power = (t - (seq_len // 2)) / self.scale_base
196
+ scale = self.scale ** rearrange(power, 'n -> n 1')
197
+
198
+ return freqs, torch.cat((scale, scale), dim = -1)
199
+
200
+ class LocalAttention(nn.Module):
201
+ def __init__(self, window_size, causal = False, look_backward = 1, look_forward = None, dropout = 0., shared_qk = False, rel_pos_emb_config = None, dim = None, autopad = False, exact_windowsize = False, scale = None, use_rotary_pos_emb = True, use_xpos = False, xpos_scale_base = None):
202
+ super().__init__()
203
+ look_forward = default(look_forward, 0 if causal else 1)
204
+ assert not (causal and look_forward > 0)
205
+ self.scale = scale
206
+ self.window_size = window_size
207
+ self.autopad = autopad
208
+ self.exact_windowsize = exact_windowsize
209
+ self.causal = causal
210
+ self.look_backward = look_backward
211
+ self.look_forward = look_forward
212
+ self.dropout = nn.Dropout(dropout)
213
+ self.shared_qk = shared_qk
214
+ self.rel_pos = None
215
+ self.use_xpos = use_xpos
216
+ if use_rotary_pos_emb and (exists(rel_pos_emb_config) or exists(dim)):
217
+ if exists(rel_pos_emb_config): dim = rel_pos_emb_config[0]
218
+ self.rel_pos = SinusoidalEmbeddings(dim, use_xpos = use_xpos, scale_base = default(xpos_scale_base, window_size // 2))
219
+
220
+ def forward(self, q, k, v, mask = None, input_mask = None, attn_bias = None, window_size = None):
221
+ mask = default(mask, input_mask)
222
+ assert not (exists(window_size) and not self.use_xpos)
223
+
224
+ _, autopad, pad_value, window_size, causal, look_backward, look_forward, shared_qk = q.shape, self.autopad, -1, default(window_size, self.window_size), self.causal, self.look_backward, self.look_forward, self.shared_qk
225
+ (q, packed_shape), (k, _), (v, _) = map(lambda t: pack([t], '* n d'), (q, k, v))
226
+
227
+ if autopad:
228
+ orig_seq_len = q.shape[1]
229
+ (_, q), (_, k), (_, v) = map(lambda t: pad_to_multiple(t, self.window_size, dim = -2), (q, k, v))
230
+
231
+ b, n, dim_head, device, dtype = *q.shape, q.device, q.dtype
232
+ scale = default(self.scale, dim_head ** -0.5)
233
+
234
+ assert (n % window_size) == 0
235
+ windows = n // window_size
236
+
237
+ if shared_qk: k = F.normalize(k, dim = -1).type(k.dtype)
238
+
239
+ seq = torch.arange(n, device = device)
240
+ b_t = rearrange(seq, '(w n) -> 1 w n', w = windows, n = window_size)
241
+ bq, bk, bv = map(lambda t: rearrange(t, 'b (w n) d -> b w n d', w = windows), (q, k, v))
242
+
243
+ bq = bq * scale
244
+ look_around_kwargs = dict(backward = look_backward, forward = look_forward, pad_value = pad_value)
245
+
246
+ bk = look_around(bk, **look_around_kwargs)
247
+ bv = look_around(bv, **look_around_kwargs)
248
+
249
+ if exists(self.rel_pos):
250
+ pos_emb, xpos_scale = self.rel_pos(bk)
251
+ bq, bk = apply_rotary_pos_emb(bq, bk, pos_emb, scale = xpos_scale)
252
+
253
+ bq_t = b_t
254
+ bq_k = look_around(b_t, **look_around_kwargs)
255
+ bq_t = rearrange(bq_t, '... i -> ... i 1')
256
+ bq_k = rearrange(bq_k, '... j -> ... 1 j')
257
+
258
+ pad_mask = bq_k == pad_value
259
+ sim = einsum('b h i e, b h j e -> b h i j', bq, bk)
260
+
261
+ if exists(attn_bias):
262
+ heads = attn_bias.shape[0]
263
+ assert (b % heads) == 0
264
+
265
+ attn_bias = repeat(attn_bias, 'h i j -> (b h) 1 i j', b = b // heads)
266
+ sim = sim + attn_bias
267
+
268
+ mask_value = -torch.finfo(sim.dtype).max
269
+ if shared_qk:
270
+ self_mask = bq_t == bq_k
271
+ sim = sim.masked_fill(self_mask, -5e4)
272
+ del self_mask
273
+
274
+ if causal:
275
+ causal_mask = bq_t < bq_k
276
+ if self.exact_windowsize: causal_mask = causal_mask | (bq_t > (bq_k + (self.window_size * self.look_backward)))
277
+ sim = sim.masked_fill(causal_mask, mask_value)
278
+ del causal_mask
279
+
280
+ sim = sim.masked_fill(((bq_k - (self.window_size * self.look_forward)) > bq_t) | (bq_t > (bq_k + (self.window_size * self.look_backward))) | pad_mask, mask_value) if not causal and self.exact_windowsize else sim.masked_fill(pad_mask, mask_value)
281
+
282
+ if exists(mask):
283
+ batch = mask.shape[0]
284
+ assert (b % batch) == 0
285
+
286
+ h = b // mask.shape[0]
287
+ if autopad: _, mask = pad_to_multiple(mask, window_size, dim = -1, value = False)
288
+
289
+ mask = repeat(rearrange(look_around(rearrange(mask, '... (w n) -> (...) w n', w = windows, n = window_size), **{**look_around_kwargs, 'pad_value': False}), '... j -> ... 1 j'), 'b ... -> (b h) ...', h = h)
290
+ sim = sim.masked_fill(~mask, mask_value)
291
+
292
+ del mask
293
+
294
+ out = rearrange(einsum('b h i j, b h j e -> b h i e', self.dropout(sim.softmax(dim = -1)), bv), 'b w n d -> b (w n) d')
295
+ if autopad: out = out[:, :orig_seq_len, :]
296
+
297
+ out, *_ = unpack(out, packed_shape, '* n d')
298
+ return out
299
+
300
+ class FastAttention(nn.Module):
301
+ def __init__(self, dim_heads, nb_features=None, ortho_scaling=0, causal=False, generalized_attention=False, kernel_fn=nn.ReLU(), qr_uniform_q=False, no_projection=False):
302
+ super().__init__()
303
+ nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))
304
+ self.dim_heads = dim_heads
305
+ self.nb_features = nb_features
306
+ self.ortho_scaling = ortho_scaling
307
+ self.create_projection = partial(gaussian_orthogonal_random_matrix, nb_rows=self.nb_features, nb_columns=dim_heads, scaling=ortho_scaling, qr_uniform_q=qr_uniform_q)
308
+ projection_matrix = self.create_projection()
309
+ self.register_buffer("projection_matrix", projection_matrix)
310
+ self.generalized_attention = generalized_attention
311
+ self.kernel_fn = kernel_fn
312
+ self.no_projection = no_projection
313
+ self.causal = causal
314
+
315
+ @torch.no_grad()
316
+ def redraw_projection_matrix(self):
317
+ projections = self.create_projection()
318
+ self.projection_matrix.copy_(projections)
319
+ del projections
320
+
321
+ def forward(self, q, k, v):
322
+ if self.no_projection: q, k = q.softmax(dim=-1), (torch.exp(k) if self.causal else k.softmax(dim=-2))
323
+ else:
324
+ create_kernel = partial(softmax_kernel, projection_matrix=self.projection_matrix, device=q.device)
325
+ q, k = create_kernel(q, is_query=True), create_kernel(k, is_query=False)
326
+
327
+ attn_fn = linear_attention if not self.causal else self.causal_linear_fn
328
+ return attn_fn(q, k, None) if v is None else attn_fn(q, k, v)
329
+
330
+ class SelfAttention(nn.Module):
331
+ def __init__(self, dim, causal=False, heads=8, dim_head=64, local_heads=0, local_window_size=256, nb_features=None, feature_redraw_interval=1000, generalized_attention=False, kernel_fn=nn.ReLU(), qr_uniform_q=False, dropout=0.0, no_projection=False):
332
+ super().__init__()
333
+ assert dim % heads == 0
334
+ dim_head = default(dim_head, dim // heads)
335
+ inner_dim = dim_head * heads
336
+ self.fast_attention = FastAttention(dim_head, nb_features, causal=causal, generalized_attention=generalized_attention, kernel_fn=kernel_fn, qr_uniform_q=qr_uniform_q, no_projection=no_projection)
337
+ self.heads = heads
338
+ self.global_heads = heads - local_heads
339
+ self.local_attn = (LocalAttention(window_size=local_window_size, causal=causal, autopad=True, dropout=dropout, look_forward=int(not causal), rel_pos_emb_config=(dim_head, local_heads)) if local_heads > 0 else None)
340
+ self.to_q = nn.Linear(dim, inner_dim)
341
+ self.to_k = nn.Linear(dim, inner_dim)
342
+ self.to_v = nn.Linear(dim, inner_dim)
343
+ self.to_out = nn.Linear(inner_dim, dim)
344
+ self.dropout = nn.Dropout(dropout)
345
+
346
+ @torch.no_grad()
347
+ def redraw_projection_matrix(self):
348
+ self.fast_attention.redraw_projection_matrix()
349
+
350
+ def forward(self, x, context=None, mask=None, context_mask=None, name=None, inference=False, **kwargs):
351
+ _, _, _, h, gh = *x.shape, self.heads, self.global_heads
352
+ cross_attend = exists(context)
353
+ context = default(context, x)
354
+ context_mask = default(context_mask, mask) if not cross_attend else context_mask
355
+
356
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (self.to_q(x), self.to_k(context), self.to_v(context)))
357
+ (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v))
358
+
359
+ attn_outs = []
360
+
361
+ if not empty(q):
362
+ if exists(context_mask): v.masked_fill_(~context_mask[:, None, :, None], 0.0)
363
+ if cross_attend: pass
364
+ else: out = self.fast_attention(q, k, v)
365
+
366
+ attn_outs.append(out)
367
+
368
+ if not empty(lq):
369
+ assert (not cross_attend), "not cross_attend"
370
+
371
+ out = self.local_attn(lq, lk, lv, input_mask=mask)
372
+ attn_outs.append(out)
373
+
374
+ return self.dropout(self.to_out(rearrange(torch.cat(attn_outs, dim=1), "b h n d -> b n (h d)")))
375
+
376
+ class DotDict(dict):
377
+ def __getattr__(*args):
378
+ val = dict.get(*args)
379
+ return DotDict(val) if type(val) is dict else val
380
+
381
+ __setattr__ = dict.__setitem__
382
+ __delattr__ = dict.__delitem__
383
+
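DotDict simply rewraps nested dicts on attribute access; a tiny sketch with an invented config:

cfg = DotDict({"mel": {"sr": 16000, "num_mels": 128}})
print(cfg.mel.sr)   # -> 16000 (nested dicts come back as DotDict)
print(cfg.missing)  # -> None (absent keys fall back to dict.get)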
384
+ class Swish(nn.Module):
385
+ def forward(self, x):
386
+ return x * x.sigmoid()
387
+
388
+ class Transpose(nn.Module):
389
+ def __init__(self, dims):
390
+ super().__init__()
391
+ assert len(dims) == 2, "dims == 2"
392
+ self.dims = dims
393
+
394
+ def forward(self, x):
395
+ return x.transpose(*self.dims)
396
+
397
+ class GLU(nn.Module):
398
+ def __init__(self, dim):
399
+ super().__init__()
400
+ self.dim = dim
401
+
402
+ def forward(self, x):
403
+ out, gate = x.chunk(2, dim=self.dim)
404
+ return out * gate.sigmoid()
405
+
406
+ class ConformerConvModule_LEGACY(nn.Module):
407
+ def __init__(self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0):
408
+ super().__init__()
409
+ inner_dim = dim * expansion_factor
410
+ self.net = nn.Sequential(nn.LayerNorm(dim), Transpose((1, 2)), nn.Conv1d(dim, inner_dim * 2, 1), GLU(dim=1), DepthWiseConv1d_LEGACY(inner_dim, inner_dim, kernel_size=kernel_size, padding=(calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0))), Swish(), nn.Conv1d(inner_dim, dim, 1), Transpose((1, 2)), nn.Dropout(dropout))
411
+
412
+ def forward(self, x):
413
+ return self.net(x)
414
+
415
+ class ConformerConvModule(nn.Module):
416
+ def __init__(self, dim, expansion_factor=2, kernel_size=31, dropout=0):
417
+ super().__init__()
418
+ inner_dim = dim * expansion_factor
419
+ self.net = nn.Sequential(nn.LayerNorm(dim), Transpose((1, 2)), nn.Conv1d(dim, inner_dim * 2, 1), nn.GLU(dim=1), DepthWiseConv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=calc_same_padding(kernel_size)[0], groups=inner_dim), nn.SiLU(), nn.Conv1d(inner_dim, dim, 1), Transpose((1, 2)), nn.Dropout(dropout))
420
+
421
+ def forward(self, x):
422
+ return self.net(x)
423
+
424
+ class DepthWiseConv1d_LEGACY(nn.Module):
425
+ def __init__(self, chan_in, chan_out, kernel_size, padding):
426
+ super().__init__()
427
+ self.padding = padding
428
+ self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)
429
+
430
+ def forward(self, x):
431
+ return self.conv(F.pad(x, self.padding))
432
+
433
+ class DepthWiseConv1d(nn.Module):
434
+ def __init__(self, chan_in, chan_out, kernel_size, padding, groups):
435
+ super().__init__()
436
+ self.conv = nn.Conv1d(chan_in, chan_out, kernel_size=kernel_size, padding=padding, groups=groups)
437
+
438
+ def forward(self, x):
439
+ return self.conv(x)
440
+
441
+ class EncoderLayer(nn.Module):
442
+ def __init__(self, parent):
443
+ super().__init__()
444
+ self.conformer = ConformerConvModule_LEGACY(parent.dim_model)
445
+ self.norm = nn.LayerNorm(parent.dim_model)
446
+ self.dropout = nn.Dropout(parent.residual_dropout)
447
+ self.attn = SelfAttention(dim=parent.dim_model, heads=parent.num_heads, causal=False)
448
+
449
+ def forward(self, phone, mask=None):
450
+ phone = phone + (self.attn(self.norm(phone), mask=mask))
451
+ return phone + (self.conformer(phone))
452
+
453
+ class ConformerNaiveEncoder(nn.Module):
454
+ def __init__(self, num_layers, num_heads, dim_model, use_norm = False, conv_only = False, conv_dropout = 0, atten_dropout = 0):
455
+ super().__init__()
456
+ self.num_layers = num_layers
457
+ self.num_heads = num_heads
458
+ self.dim_model = dim_model
459
+ self.use_norm = use_norm
460
+ self.residual_dropout = 0.1
461
+ self.attention_dropout = 0.1
462
+ self.encoder_layers = nn.ModuleList([CFNEncoderLayer(dim_model, num_heads, use_norm, conv_only, conv_dropout, atten_dropout) for _ in range(num_layers)])
463
+
464
+ def forward(self, x, mask=None):
465
+ for (_, layer) in enumerate(self.encoder_layers):
466
+ x = layer(x, mask)
467
+
468
+ return x
469
+
470
+ class CFNEncoderLayer(nn.Module):
471
+ def __init__(self, dim_model, num_heads = 8, use_norm = False, conv_only = False, conv_dropout = 0, atten_dropout = 0):
472
+ super().__init__()
473
+ self.conformer = nn.Sequential(ConformerConvModule(dim_model), nn.Dropout(conv_dropout)) if conv_dropout > 0 else ConformerConvModule(dim_model)
474
+ self.norm = nn.LayerNorm(dim_model)
475
+ self.dropout = nn.Dropout(0.1)
476
+ self.attn = SelfAttention(dim=dim_model, heads=num_heads, causal=False, dropout=atten_dropout) if not conv_only else None  # the SelfAttention defined above has no use_norm parameter
477
+
478
+ def forward(self, x, mask=None):
479
+ if self.attn is not None: x = x + (self.attn(self.norm(x), mask=mask))
480
+ return x + (self.conformer(x))
481
+
482
+
483
+ class HannWindow(torch.nn.Module):
484
+ def __init__(self, win_size):
485
+ super().__init__()
486
+ self.register_buffer('window', torch.hann_window(win_size), persistent=False)
487
+
488
+ def forward(self):
489
+ return self.window
490
+
491
+ class MelModule(torch.nn.Module):
492
+ def __init__(self, sr, n_mels, n_fft, win_size, hop_length, fmin = None, fmax = None, clip_val = 1e-5, out_stft = False):
493
+ super().__init__()
494
+ if fmin is None: fmin = 0
495
+ if fmax is None: fmax = sr / 2
496
+ self.target_sr = sr
497
+ self.n_mels = n_mels
498
+ self.n_fft = n_fft
499
+ self.win_size = win_size
500
+ self.hop_length = hop_length
501
+ self.fmin = fmin
502
+ self.fmax = fmax
503
+ self.clip_val = clip_val
504
+ self.register_buffer('mel_basis', torch.tensor(mel(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)).float(), persistent=False)
505
+ self.hann_window = torch.nn.ModuleDict()
506
+ self.out_stft = out_stft
507
+
508
+ @torch.no_grad()
509
+ def __call__(self, y, key_shift = 0, speed = 1, center = False, no_cache_window = False):
510
+ n_fft = self.n_fft
511
+ win_size = self.win_size
512
+ hop_length = self.hop_length
513
+ clip_val = self.clip_val
514
+ factor = 2 ** (key_shift / 12)
515
+ n_fft_new = int(np.round(n_fft * factor))
516
+ win_size_new = int(np.round(win_size * factor))
517
+ hop_length_new = int(np.round(hop_length * speed))
518
+
519
+ y = y.squeeze(-1)
520
+ key_shift_key = str(key_shift)
521
+
522
+ if not no_cache_window:
523
+ if key_shift_key in self.hann_window: hann_window = self.hann_window[key_shift_key]
524
+ else:
525
+ hann_window = HannWindow(win_size_new).to(self.mel_basis.device)
526
+ self.hann_window[key_shift_key] = hann_window
527
+
528
+ hann_window_tensor = hann_window()
529
+ else: hann_window_tensor = torch.hann_window(win_size_new).to(self.mel_basis.device)
530
+
531
+ pad_left = (win_size_new - hop_length_new) // 2
532
+ pad_right = max((win_size_new - hop_length_new + 1) // 2, win_size_new - y.size(-1) - pad_left)
533
+
534
+ mode = 'reflect' if pad_right < y.size(-1) else 'constant'
535
+ pad = F.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode).squeeze(1)
536
+
537
+ if str(y.device).startswith("ocl"):
538
+ stft = opencl.STFT(filter_length=n_fft_new, hop_length=hop_length_new, win_length=win_size_new).to(y.device)
539
+ spec = stft.transform(pad, 1e-9)
540
+ else:
541
+ spec = torch.stft(pad, n_fft_new, hop_length=hop_length_new, win_length=win_size_new, window=hann_window_tensor, center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
542
+ spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-9)
543
+
544
+ if key_shift != 0:
545
+ size = n_fft // 2 + 1
546
+ resize = spec.size(1)
547
+
548
+ if resize < size: spec = F.pad(spec, (0, 0, 0, size - resize))
549
+ spec = spec[:, :size, :] * win_size / win_size_new
550
+
551
+ spec = spec[:, :512, :] if self.out_stft else torch.matmul(self.mel_basis, spec)
552
+ return torch.log(torch.clamp(spec, min=clip_val) * 1).transpose(-1, -2)
553
+
554
+ class Wav2MelModule(torch.nn.Module):
555
+ def __init__(self, sr, n_mels, n_fft, win_size, hop_length, fmin = None, fmax = None, clip_val = 1e-5, mel_type="default"):
556
+ super().__init__()
557
+ if fmin is None: fmin = 0
558
+ if fmax is None: fmax = sr / 2
559
+ self.sampling_rate = sr
560
+ self.n_mels = n_mels
561
+ self.n_fft = n_fft
562
+ self.win_size = win_size
563
+ self.hop_size = hop_length
564
+ self.fmin = fmin
565
+ self.fmax = fmax
566
+ self.clip_val = clip_val
567
+ self.register_buffer('tensor_device_marker', torch.tensor(1.0).float(), persistent=False)
568
+ self.resample_kernel = torch.nn.ModuleDict()
569
+ if mel_type == "default": self.mel_extractor = MelModule(sr, n_mels, n_fft, win_size, hop_length, fmin, fmax, clip_val, out_stft=False)
570
+ elif mel_type == "stft": self.mel_extractor = MelModule(sr, n_mels, n_fft, win_size, hop_length, fmin, fmax, clip_val, out_stft=True)
571
+ self.mel_type = mel_type
572
+
573
+ @torch.no_grad()
574
+ def __call__(self, audio, sample_rate, keyshift = 0, no_cache_window = False):
575
+ if sample_rate == self.sampling_rate: audio_res = audio
576
+ else:
577
+ key_str = str(sample_rate)
578
+ if key_str not in self.resample_kernel:
579
+ if len(self.resample_kernel) > 8: self.resample_kernel.clear()
580
+ self.resample_kernel[key_str] = Resample(sample_rate, self.sampling_rate, lowpass_filter_width=128).to(self.tensor_device_marker.device)
581
+
582
+ audio_res = self.resample_kernel[key_str](audio.squeeze(-1)).unsqueeze(-1)
583
+
584
+ mel = self.mel_extractor(audio_res, keyshift, no_cache_window=no_cache_window)
585
+ n_frames = int(audio.shape[1] // self.hop_size) + 1
586
+ if n_frames > int(mel.shape[1]): mel = torch.cat((mel, mel[:, -1:, :]), 1)
587
+ if n_frames < int(mel.shape[1]): mel = mel[:, :n_frames, :]
588
+
589
+ return mel
590
+
591
+ class STFT:
592
+ def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5):
593
+ self.target_sr = sr
594
+ self.n_mels = n_mels
595
+ self.n_fft = n_fft
596
+ self.win_size = win_size
597
+ self.hop_length = hop_length
598
+ self.fmin = fmin
599
+ self.fmax = fmax
600
+ self.clip_val = clip_val
601
+ self.mel_basis = {}
602
+ self.hann_window = {}
603
+
604
+ def get_mel(self, y, keyshift=0, speed=1, center=False, train=False):
605
+ n_fft = self.n_fft
606
+ win_size = self.win_size
607
+ hop_length = self.hop_length
608
+ fmax = self.fmax
609
+ factor = 2 ** (keyshift / 12)
610
+ win_size_new = int(np.round(win_size * factor))
611
+ hop_length_new = int(np.round(hop_length * speed))
612
+ mel_basis = self.mel_basis if not train else {}
613
+ hann_window = self.hann_window if not train else {}
614
+ mel_basis_key = str(fmax) + "_" + str(y.device)
615
+
616
+ if mel_basis_key not in mel_basis: mel_basis[mel_basis_key] = torch.from_numpy(mel(sr=self.target_sr, n_fft=n_fft, n_mels=self.n_mels, fmin=self.fmin, fmax=fmax)).float().to(y.device)
617
+ keyshift_key = str(keyshift) + "_" + str(y.device)
618
+ if keyshift_key not in hann_window: hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)
619
+
620
+ pad_left = (win_size_new - hop_length_new) // 2
621
+ pad_right = max((win_size_new - hop_length_new + 1) // 2, win_size_new - y.size(-1) - pad_left)
622
+
623
+ pad = F.pad(y.unsqueeze(1), (pad_left, pad_right), mode="reflect" if pad_right < y.size(-1) else "constant").squeeze(1)
624
+ n_fft = int(np.round(n_fft * factor))
625
+
626
+ if str(y.device).startswith("ocl"):
627
+ stft = opencl.STFT(filter_length=n_fft, hop_length=hop_length_new, win_length=win_size_new).to(y.device)
628
+ spec = stft.transform(pad, 1e-9)
629
+ else:
630
+ spec = torch.stft(pad, n_fft, hop_length=hop_length_new, win_length=win_size_new, window=hann_window[keyshift_key], center=center, pad_mode="reflect", normalized=False, onesided=True, return_complex=True)
631
+ spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-9)
632
+
633
+ if keyshift != 0:
634
+ size = n_fft // 2 + 1
635
+ resize = spec.size(1)
636
+ spec = (F.pad(spec, (0, 0, 0, size - resize)) if resize < size else spec[:, :size, :]) * win_size / win_size_new
637
+
638
+ return torch.log(torch.clamp(torch.matmul(mel_basis[mel_basis_key], spec), min=self.clip_val) * 1)
639
+
640
+ class Wav2Mel:
641
+ def __init__(self, device=None, dtype=torch.float32):
642
+ self.sample_rate = 16000
643
+ self.hop_size = 160
644
+ if device is None: device = "cuda" if torch.cuda.is_available() else "cpu"
645
+ self.device = device
646
+ self.dtype = dtype
647
+ self.stft = STFT(16000, 128, 1024, 1024, 160, 0, 8000)
648
+ self.resample_kernel = {}
649
+
650
+ def extract_nvstft(self, audio, keyshift=0, train=False):
651
+ return self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2)
652
+
653
+ def extract_mel(self, audio, sample_rate, keyshift=0, train=False):
654
+ audio = audio.to(self.dtype).to(self.device)
655
+ if sample_rate == self.sample_rate: audio_res = audio
656
+ else:
657
+ key_str = str(sample_rate)
658
+ if key_str not in self.resample_kernel: self.resample_kernel[key_str] = Resample(sample_rate, self.sample_rate, lowpass_filter_width=128)
659
+ self.resample_kernel[key_str] = (self.resample_kernel[key_str].to(self.dtype).to(self.device))
660
+ audio_res = self.resample_kernel[key_str](audio)
661
+
662
+ mel = self.extract_nvstft(audio_res, keyshift=keyshift, train=train)
663
+ n_frames = int(audio.shape[1] // self.hop_size) + 1
664
+ mel = (torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel)
665
+ return mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel
666
+
667
+ def __call__(self, audio, sample_rate, keyshift=0, train=False):
668
+ return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train)
669
+
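A hedged usage sketch of the legacy Wav2Mel front end above; the input is synthetic, but the frame count follows n_frames = samples // hop_size + 1 from extract_mel:

wav2mel = Wav2Mel(device="cpu")
audio = torch.randn(1, 16000)            # (batch, samples), 1 s at 16 kHz, invented input
mel = wav2mel(audio, sample_rate=16000)  # no resampling needed at the native 16 kHz
print(mel.shape)                         # expected torch.Size([1, 101, 128]): 16000 // 160 + 1 frames, 128 mel bins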
670
+ class PCmer(nn.Module):
671
+ def __init__(self, num_layers, num_heads, dim_model, dim_keys, dim_values, residual_dropout, attention_dropout):
672
+ super().__init__()
673
+ self.num_layers = num_layers
674
+ self.num_heads = num_heads
675
+ self.dim_model = dim_model
676
+ self.dim_values = dim_values
677
+ self.dim_keys = dim_keys
678
+ self.residual_dropout = residual_dropout
679
+ self.attention_dropout = attention_dropout
680
+ self._layers = nn.ModuleList([EncoderLayer(self) for _ in range(num_layers)])
681
+
682
+ def forward(self, phone, mask=None):
683
+ for layer in self._layers:
684
+ phone = layer(phone, mask)
685
+
686
+ return phone
687
+
688
+ class CFNaiveMelPE(nn.Module):
689
+ def __init__(self, input_channels, out_dims, hidden_dims = 512, n_layers = 6, n_heads = 8, f0_max = 1975.5, f0_min = 32.70, use_fa_norm = False, conv_only = False, conv_dropout = 0, atten_dropout = 0, use_harmonic_emb = False):
690
+ super().__init__()
691
+ self.input_channels = input_channels
692
+ self.out_dims = out_dims
693
+ self.hidden_dims = hidden_dims
694
+ self.n_layers = n_layers
695
+ self.n_heads = n_heads
696
+ self.f0_max = f0_max
697
+ self.f0_min = f0_min
698
+ self.use_fa_norm = use_fa_norm
699
+ self.residual_dropout = 0.1
700
+ self.attention_dropout = 0.1
701
+ self.harmonic_emb = nn.Embedding(9, hidden_dims) if use_harmonic_emb else None
702
+ self.input_stack = nn.Sequential(nn.Conv1d(input_channels, hidden_dims, 3, 1, 1), nn.GroupNorm(4, hidden_dims), nn.LeakyReLU(), nn.Conv1d(hidden_dims, hidden_dims, 3, 1, 1))
703
+ self.net = ConformerNaiveEncoder(num_layers=n_layers, num_heads=n_heads, dim_model=hidden_dims, use_norm=use_fa_norm, conv_only=conv_only, conv_dropout=conv_dropout, atten_dropout=atten_dropout)
704
+ self.norm = nn.LayerNorm(hidden_dims)
705
+ self.output_proj = weight_norm(nn.Linear(hidden_dims, out_dims))
706
+ self.cent_table_b = torch.linspace(self.f0_to_cent(torch.Tensor([f0_min]))[0], self.f0_to_cent(torch.Tensor([f0_max]))[0], out_dims).detach()
707
+ self.register_buffer("cent_table", self.cent_table_b)
708
+ self.gaussian_blurred_cent_mask_b = (1200 * torch.log2(torch.Tensor([self.f0_max / 10.])))[0].detach()
709
+ self.register_buffer("gaussian_blurred_cent_mask", self.gaussian_blurred_cent_mask_b)
710
+
711
+ def forward(self, x, _h_emb=None):
712
+ x = self.input_stack(x.transpose(-1, -2)).transpose(-1, -2)
713
+ if self.harmonic_emb is not None: x = x + self.harmonic_emb(torch.LongTensor([0]).to(x.device)) if _h_emb is None else x + self.harmonic_emb(torch.LongTensor([int(_h_emb)]).to(x.device))
714
+ return torch.sigmoid(self.output_proj(self.norm(self.net(x))))
715
+
716
+ @torch.no_grad()
717
+ def latent2cents_decoder(self, y, threshold = 0.05, mask = True):
718
+ B, N, _ = y.size()
719
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
720
+ rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(y, dim=-1, keepdim=True)
721
+
722
+ if mask:
723
+ confident = torch.max(y, dim=-1, keepdim=True)[0]
724
+ confident_mask = torch.ones_like(confident)
725
+ confident_mask[confident <= threshold] = float("-INF")
726
+ rtn = rtn * confident_mask
727
+
728
+ return rtn
729
+
730
+ @torch.no_grad()
731
+ def latent2cents_local_decoder(self, y, threshold = 0.05, mask = True):
732
+ B, N, _ = y.size()
733
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
734
+ confident, max_index = torch.max(y, dim=-1, keepdim=True)
735
+
736
+ local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4)
737
+ local_argmax_index[local_argmax_index < 0] = 0
738
+ local_argmax_index[local_argmax_index >= self.out_dims] = self.out_dims - 1
739
+
740
+ y_l = torch.gather(y, -1, local_argmax_index)
741
+ rtn = torch.sum(torch.gather(ci, -1, local_argmax_index) * y_l, dim=-1, keepdim=True) / torch.sum(y_l, dim=-1, keepdim=True)
742
+
743
+ if mask:
744
+ confident_mask = torch.ones_like(confident)
745
+ confident_mask[confident <= threshold] = float("-INF")
746
+ rtn = rtn * confident_mask
747
+
748
+ return rtn
749
+
750
+ @torch.no_grad()
751
+ def infer(self, mel, decoder = "local_argmax", threshold = 0.05):
752
+ latent = self.forward(mel)
753
+ if decoder == "argmax": cents = self.latent2cents_decoder  # global weighted-argmax decoder; local_argmax is handled below
754
+ elif decoder == "local_argmax": cents = self.latent2cents_local_decoder
755
+
756
+ return self.cent_to_f0(cents(latent, threshold=threshold))
757
+
758
+ @torch.no_grad()
759
+ def cent_to_f0(self, cent: torch.Tensor) -> torch.Tensor:
760
+ return 10 * 2 ** (cent / 1200)
761
+
762
+ @torch.no_grad()
763
+ def f0_to_cent(self, f0):
764
+ return 1200 * torch.log2(f0 / 10)
765
+
766
+ class FCPE_LEGACY(nn.Module):
767
+ def __init__(self, input_channel=128, out_dims=360, n_layers=12, n_chans=512, loss_mse_scale=10, loss_l2_regularization=False, loss_l2_regularization_scale=1, loss_grad1_mse=False, loss_grad1_mse_scale=1, f0_max=1975.5, f0_min=32.70, confidence=False, threshold=0.05, use_input_conv=True):
768
+ super().__init__()
769
+ self.loss_mse_scale = loss_mse_scale
770
+ self.loss_l2_regularization = loss_l2_regularization
771
+ self.loss_l2_regularization_scale = loss_l2_regularization_scale
772
+ self.loss_grad1_mse = loss_grad1_mse
773
+ self.loss_grad1_mse_scale = loss_grad1_mse_scale
774
+ self.f0_max = f0_max
775
+ self.f0_min = f0_min
776
+ self.confidence = confidence
777
+ self.threshold = threshold
778
+ self.use_input_conv = use_input_conv
779
+ self.cent_table_b = torch.Tensor(np.linspace(self.f0_to_cent(torch.Tensor([f0_min]))[0], self.f0_to_cent(torch.Tensor([f0_max]))[0], out_dims))
780
+ self.register_buffer("cent_table", self.cent_table_b)
781
+ self.stack = nn.Sequential(nn.Conv1d(input_channel, n_chans, 3, 1, 1), nn.GroupNorm(4, n_chans), nn.LeakyReLU(), nn.Conv1d(n_chans, n_chans, 3, 1, 1))
782
+ self.decoder = PCmer(num_layers=n_layers, num_heads=8, dim_model=n_chans, dim_keys=n_chans, dim_values=n_chans, residual_dropout=0.1, attention_dropout=0.1)
783
+ self.norm = nn.LayerNorm(n_chans)
784
+ self.n_out = out_dims
785
+ self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out))
786
+
787
+ def forward(self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax", output_interp_target_length=None):
788
+ if cdecoder == "argmax": self.cdecoder = self.cents_decoder
789
+ elif cdecoder == "local_argmax": self.cdecoder = self.cents_local_decoder
790
+
791
+ x = torch.sigmoid(self.dense_out(self.norm(self.decoder((self.stack(mel.transpose(1, 2)).transpose(1, 2) if self.use_input_conv else mel)))))
792
+
793
+ if not infer:
794
+ loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, self.gaussian_blurred_cent(self.f0_to_cent(gt_f0)))
795
+ if self.loss_l2_regularization: loss_all = loss_all + l2_regularization(model=self, l2_alpha=self.loss_l2_regularization_scale)
796
+ x = loss_all
797
+ else:
798
+ x = self.cent_to_f0(self.cdecoder(x))
799
+ x = (1 + x / 700).log() if not return_hz_f0 else x
800
+
801
+ if output_interp_target_length is not None:
802
+ x = F.interpolate(torch.where(x == 0, float("nan"), x).transpose(1, 2), size=int(output_interp_target_length), mode="linear").transpose(1, 2)
803
+ x = torch.where(x.isnan(), float(0.0), x)
804
+
805
+ return x
806
+
807
+ def cents_decoder(self, y, mask=True):
808
+ B, N, _ = y.size()
809
+ rtn = torch.sum(self.cent_table[None, None, :].expand(B, N, -1) * y, dim=-1, keepdim=True) / torch.sum(y, dim=-1, keepdim=True)
810
+
811
+ if mask:
812
+ confident = torch.max(y, dim=-1, keepdim=True)[0]
813
+ confident_mask = torch.ones_like(confident)
814
+ confident_mask[confident <= self.threshold] = float("-INF")
815
+ rtn = rtn * confident_mask
816
+
817
+ return (rtn, confident) if self.confidence else rtn
818
+
819
+ def cents_local_decoder(self, y, mask=True):
820
+ B, N, _ = y.size()
821
+
822
+ confident, max_index = torch.max(y, dim=-1, keepdim=True)
823
+ local_argmax_index = torch.clamp(torch.arange(0, 9).to(max_index.device) + (max_index - 4), 0, self.n_out - 1)
824
+ y_l = torch.gather(y, -1, local_argmax_index)
825
+ rtn = torch.sum(torch.gather(self.cent_table[None, None, :].expand(B, N, -1), -1, local_argmax_index) * y_l, dim=-1, keepdim=True) / torch.sum(y_l, dim=-1, keepdim=True)
826
+
827
+ if mask:
828
+ confident_mask = torch.ones_like(confident)
829
+ confident_mask[confident <= self.threshold] = float("-INF")
830
+ rtn = rtn * confident_mask
831
+
832
+ return (rtn, confident) if self.confidence else rtn
833
+
834
+ def cent_to_f0(self, cent):
835
+ return 10.0 * 2 ** (cent / 1200.0)
836
+
837
+ def f0_to_cent(self, f0):
838
+ return 1200.0 * torch.log2(f0 / 10.0)
839
+
840
+ def gaussian_blurred_cent(self, cents):
841
+ B, N, _ = cents.size()
842
+ return torch.exp(-torch.square(self.cent_table[None, None, :].expand(B, N, -1) - cents) / 1250) * ((cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0)))).float()  # mask grouped before the multiply; & binds looser than *, so the ungrouped form fails on float tensors
843
+
844
+ class InferCFNaiveMelPE(torch.nn.Module):
845
+ def __init__(self, args, state_dict):
846
+ super().__init__()
847
+ self.wav2mel = spawn_wav2mel(args, device="cpu")
848
+ self.model = CFNaiveMelPE(input_channels=args.mel.num_mels, out_dims=args.model.out_dims, hidden_dims=args.model.hidden_dims, n_layers=args.model.n_layers, n_heads=args.model.n_heads, f0_max=args.model.f0_max, f0_min=args.model.f0_min, use_fa_norm=args.model.use_fa_norm, conv_only=args.model.conv_only, conv_dropout=args.model.conv_dropout, atten_dropout=args.model.atten_dropout, use_harmonic_emb=False)
849
+ self.model.load_state_dict(state_dict)
850
+ self.model.eval()
851
+ self.args_dict = dict(args)
852
+ self.register_buffer("tensor_device_marker", torch.tensor(1.0).float(), persistent=False)
853
+
854
+ def forward(self, wav, sr, decoder_mode = "local_argmax", threshold = 0.006, key_shifts = [0]):
855
+ with torch.no_grad():
856
+ mels = rearrange(torch.stack([self.wav2mel(wav.to(self.tensor_device_marker.device), sr, keyshift=keyshift) for keyshift in key_shifts], -1), "B T C K -> (B K) T C")
857
+ f0s = rearrange(self.model.infer(mels, decoder=decoder_mode, threshold=threshold), "(B K) T 1 -> B T (K 1)", K=len(key_shifts))
858
+
859
+ return f0s
860
+
861
+ def infer(self, wav, sr, decoder_mode = "local_argmax", threshold = 0.006, f0_min = None, f0_max = None, interp_uv = False, output_interp_target_length = None, return_uv = False, test_time_augmentation = False, tta_uv_penalty = 12.0, tta_key_shifts = [0, -12, 12], tta_use_origin_uv=False):
862
+ if test_time_augmentation:
863
+ assert len(tta_key_shifts) > 0
864
+ flag = 0
865
+ if tta_use_origin_uv:
866
+ if 0 not in tta_key_shifts:
867
+ flag = 1
868
+ tta_key_shifts.append(0)
869
+
870
+ tta_key_shifts.sort(key=lambda x: (x if x >= 0 else -x / 2))
871
+ f0s = self.__call__(wav, sr, decoder_mode, threshold, tta_key_shifts)
872
+ f0 = ensemble_f0(f0s[:, :, flag:], tta_key_shifts[flag:], tta_uv_penalty)
873
+ f0_for_uv = f0s[:, :, [0]] if tta_use_origin_uv else f0
874
+ else:
875
+ f0 = self.__call__(wav, sr, decoder_mode, threshold)
876
+ f0_for_uv = f0
877
+
878
+ if f0_min is None: f0_min = self.args_dict["model"]["f0_min"]
879
+ uv = (f0_for_uv < f0_min).type(f0_for_uv.dtype)
880
+ f0 = f0 * (1 - uv)
881
+
882
+ if interp_uv: f0 = batch_interp_with_replacement_detach(uv.squeeze(-1).bool(), f0.squeeze(-1)).unsqueeze(-1)
883
+ if f0_max is not None: f0[f0 > f0_max] = f0_max
884
+ if output_interp_target_length is not None:
885
+ f0 = F.interpolate(torch.where(f0 == 0, float("nan"), f0).transpose(1, 2), size=int(output_interp_target_length), mode="linear").transpose(1, 2)
886
+ f0 = torch.where(f0.isnan(), float(0.0), f0)
887
+
888
+ if return_uv: return f0, F.interpolate(uv.transpose(1, 2), size=int(output_interp_target_length), mode="nearest").transpose(1, 2)
889
+ else: return f0
890
+
891
+ class FCPEInfer_LEGACY:
892
+ def __init__(self, model_path, device=None, dtype=torch.float32, f0_min=50, f0_max=1100):
893
+ if device is None: device = "cuda" if torch.cuda.is_available() else "cpu"
894
+ self.device = device
895
+ self.dtype = dtype
896
+ self.f0_min = f0_min
897
+ self.f0_max = f0_max
898
+ ckpt = torch.load(model_path, map_location=torch.device(self.device))
899
+ self.args = DotDict(ckpt["config"])
900
+ model = FCPE_LEGACY(input_channel=self.args.model.input_channel, out_dims=self.args.model.out_dims, n_layers=self.args.model.n_layers, n_chans=self.args.model.n_chans, loss_mse_scale=self.args.loss.loss_mse_scale, loss_l2_regularization=self.args.loss.loss_l2_regularization, loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale, loss_grad1_mse=self.args.loss.loss_grad1_mse, loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale, f0_max=self.f0_max, f0_min=self.f0_min, confidence=self.args.model.confidence)
901
+ model.to(self.device).to(self.dtype)
902
+ model.load_state_dict(ckpt["model"])
903
+ model.eval()
904
+ self.model = model
905
+
906
+ @torch.no_grad()
907
+ def __call__(self, audio, sr, threshold=0.05, p_len=None):
908
+ self.model.threshold = threshold
909
+ self.wav2mel = Wav2Mel(device=self.device, dtype=self.dtype)
910
+
911
+ return self.model(mel=self.wav2mel(audio=audio[None, :], sample_rate=sr).to(self.dtype), infer=True, return_hz_f0=True, output_interp_target_length=p_len)
912
+
913
+ class FCPEInfer:
914
+ def __init__(self, model_path, device=None, dtype=torch.float32, f0_min=50, f0_max=1100):
915
+ if device is None: device = "cuda" if torch.cuda.is_available() else "cpu"
916
+ self.device = device
917
+ self.dtype = dtype
918
+ self.f0_min = f0_min
919
+ self.f0_max = f0_max
920
+ ckpt = torch.load(model_path, map_location=torch.device(device))
921
+ ckpt["config_dict"]["model"]["conv_dropout"] = ckpt["config_dict"]["model"]["atten_dropout"] = 0.0
922
+ self.args = DotDict(ckpt["config_dict"])
923
+ model = InferCFNaiveMelPE(self.args, ckpt["model"])
924
+ model = model.to(device).to(self.dtype)
925
+ model.eval()
926
+ self.model = model
927
+
928
+ @torch.no_grad()
929
+ def __call__(self, audio, sr, threshold=0.05, p_len=None):
930
+ return self.model.infer(audio[None, :], sr, threshold=threshold, f0_min=self.f0_min, f0_max=self.f0_max, output_interp_target_length=p_len)
931
+
932
+ class FCPE:
933
+ def __init__(self, model_path, hop_length=512, f0_min=50, f0_max=1100, dtype=torch.float32, device=None, sample_rate=16000, threshold=0.05, legacy=False):
934
+ self.model = FCPEInfer_LEGACY if legacy else FCPEInfer
935
+ self.fcpe = self.model(model_path, device=device, dtype=dtype, f0_min=f0_min, f0_max=f0_max)
936
+ self.hop_length = hop_length
937
+ self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
938
+ self.threshold = threshold
939
+ self.sample_rate = sample_rate
940
+ self.dtype = dtype
941
+ self.legacy = legacy
942
+
943
+ def compute_f0(self, wav, p_len=None):
944
+ x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
945
+ p_len = (x.shape[0] // self.hop_length) if p_len is None else p_len
946
+
947
+ f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold, p_len=p_len)
948
+ f0 = f0[:] if f0.dim() == 1 else f0[0, :, 0]
949
+
950
+ if torch.all(f0 == 0): return f0.cpu().numpy() if p_len is None else np.zeros(p_len), (f0.cpu().numpy() if p_len is None else np.zeros(p_len))
951
+ return f0.cpu().numpy()
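A hedged end-to-end sketch of the FCPE wrapper defined above; the checkpoint path is an assumption (it mirrors the models/ convention used elsewhere in this repo) and the audio is synthetic:

fcpe = FCPE("models/fcpe.pt", hop_length=160, sample_rate=16000, device="cpu")  # path is an assumption
wav = np.random.randn(16000).astype(np.float32)  # 1 s of noise, illustration only
f0 = fcpe.compute_f0(wav)                        # NumPy f0 track with ~len(wav) // hop_length values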
RVC/modules/utils.py ADDED
@@ -0,0 +1,94 @@
1
+ import os
2
+ import gc
3
+ import sys
4
+ import torch
5
+ import codecs
6
+ import librosa
7
+ import requests
8
+
9
+ import numpy as np
10
+ import soundfile as sf
11
+ import torch.nn.functional as F
12
+
13
+ sys.path.append(os.getcwd())
14
+
15
+ from modules import opencl
16
+
17
+ def change_rms(source_audio, source_rate, target_audio, target_rate, rate):
18
+ rms2 = F.interpolate(torch.from_numpy(librosa.feature.rms(y=target_audio, frame_length=target_rate // 2 * 2, hop_length=target_rate // 2)).float().unsqueeze(0), size=target_audio.shape[0], mode="linear").squeeze()
19
+ return (target_audio * (torch.pow(F.interpolate(torch.from_numpy(librosa.feature.rms(y=source_audio, frame_length=source_rate // 2 * 2, hop_length=source_rate // 2)).float().unsqueeze(0), size=target_audio.shape[0], mode="linear").squeeze(), 1 - rate) * torch.pow(torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6), rate - 1)).numpy())
20
+
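change_rms blends the loudness envelopes of the two signals: each target sample is scaled by rms_source ** (1 - rate) * rms_target ** (rate - 1), so rate = 1 leaves the target RMS untouched and rate = 0 fully imposes the source RMS. A small numeric check of that exponent identity (values invented):

rms_source, rms_target, rate = 0.2, 0.1, 0.0
gain = rms_source ** (1 - rate) * rms_target ** (rate - 1)
print(gain)  # -> 2.0: with rate = 0 the target is rescaled to the source loudness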
21
+ def clear_gpu_cache():
22
+ gc.collect()
23
+
24
+ if torch.cuda.is_available(): torch.cuda.empty_cache()
25
+ elif torch.backends.mps.is_available(): torch.mps.empty_cache()
26
+ elif opencl.is_available(): opencl.pytorch_ocl.empty_cache()
27
+
28
+ def HF_download_file(url, output_path=None):
29
+ url = url.replace("/blob/", "/resolve/").replace("?download=true", "").strip()
30
+ output_path = os.path.basename(url) if output_path is None else (os.path.join(output_path, os.path.basename(url)) if os.path.isdir(output_path) else output_path)
31
+ response = requests.get(url, stream=True, timeout=300)
32
+
33
+ if response.status_code == 200:
34
+ with open(output_path, "wb") as f:
35
+ for chunk in response.iter_content(chunk_size=10 * 1024 * 1024):
36
+ f.write(chunk)
37
+
38
+ return output_path
39
+ else: raise ValueError(response.status_code)
40
+
41
+ def check_predictors(method):
42
+ def download(predictors):
43
+ if not os.path.exists(os.path.join("models", predictors)):
44
+ HF_download_file(codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cerqvpgbef/", "rot13") + predictors, os.path.join("models", predictors))
45
+
46
+ model_dict = {
47
+ **dict.fromkeys(["rmvpe", "rmvpe-legacy"], "rmvpe.pt"),
48
+ **dict.fromkeys(["fcpe"], "fcpe.pt"),
49
+ **dict.fromkeys(["fcpe-legacy"], "fcpe_legacy.pt"),
50
+ **dict.fromkeys(["crepe-full", "mangio-crepe-full"], "crepe_full.pth"),
51
+ **dict.fromkeys(["crepe-large", "mangio-crepe-large"], "crepe_large.pth"),
52
+ **dict.fromkeys(["crepe-medium", "mangio-crepe-medium"], "crepe_medium.pth"),
53
+ **dict.fromkeys(["crepe-small", "mangio-crepe-small"], "crepe_small.pth"),
54
+ **dict.fromkeys(["crepe-tiny", "mangio-crepe-tiny"], "crepe_tiny.pth"),
55
+ }
56
+
57
+ if method in model_dict: download(model_dict[method])
58
+
59
+ def check_embedders(hubert):
60
+ if hubert in ["contentvec_base", "hubert_base", "japanese_hubert_base", "korean_hubert_base", "chinese_hubert_base", "portuguese_hubert_base", "spin"]:
61
+ hubert += ".pt"
62
+ model_path = os.path.join("models", hubert)
63
+ if not os.path.exists(model_path):
64
+ HF_download_file("".join([codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/rzorqqref/", "rot13"), "fairseq/", hubert]), model_path)
65
+
66
+ def load_audio(file, sample_rate=16000):
67
+ try:
68
+ file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
69
+ if not os.path.isfile(file): raise FileNotFoundError(f"[ERROR] Audio file not found: {file}")
70
+
71
+ try:
72
+ audio, sr = sf.read(file, dtype=np.float32)
73
+ except Exception:
74
+ audio, sr = librosa.load(file, sr=None)
75
+
76
+ if len(audio.shape) > 1: audio = librosa.to_mono(audio.T)
77
+ if sr != sample_rate: audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate, res_type="soxr_vhq")
78
+ except Exception as e:
79
+ raise RuntimeError(f"[ERROR] Error reading audio file: {e}")
80
+
81
+ return audio.flatten()
82
+
83
+ class Autotune:
84
+ def __init__(self, ref_freqs):
85
+ self.ref_freqs = ref_freqs
86
+ self.note_dict = self.ref_freqs
87
+
88
+ def autotune_f0(self, f0, f0_autotune_strength):
89
+ autotuned_f0 = np.zeros_like(f0)
90
+
91
+ for i, freq in enumerate(f0):
92
+ autotuned_f0[i] = freq + (min(self.note_dict, key=lambda x: abs(x - freq)) - freq) * f0_autotune_strength
93
+
94
+ return autotuned_f0
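A short sketch of the Autotune helper above; the reference frequencies are a toy subset (callers presumably pass a full note table):

autotune = Autotune(ref_freqs=[220.0, 440.0, 880.0])
f0 = np.array([0.0, 230.0, 500.0])
print(autotune.autotune_f0(f0, f0_autotune_strength=1.0))  # -> [220. 220. 440.], full snap to the nearest reference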