File size: 5,377 Bytes
ad16788 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
#!/usr/bin/env python3
"""Griffin-Lim related modules."""
# Copyright 2019 Tomoki Hayashi
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
import logging
from distutils.version import LooseVersion
from functools import partial
from typeguard import check_argument_types
from typing import Optional
import librosa
import numpy as np
EPS = 1e-10
def logmel2linear(
    lmspc: np.ndarray,
    fs: int,
    n_fft: int,
    n_mels: int,
    fmin: Optional[int] = None,
    fmax: Optional[int] = None,
) -> np.ndarray:
    """Convert log Mel filterbank to linear spectrogram.

    The mel filterbank matrix is rebuilt from the given analysis parameters
    and its pseudo-inverse is applied to the (de-logged) mel spectrogram.

    Args:
        lmspc: Log Mel filterbank (T, n_mels), in base-10 log scale.
        fs: Sampling frequency.
        n_fft: The number of FFT points.
        n_mels: The number of mel basis.
        fmin: Minimum frequency to analyze. Treated as 0 when None.
        fmax: Maximum frequency to analyze. Treated as fs / 2 when None.

    Returns:
        Linear spectrogram (T, n_fft // 2 + 1), floored at EPS.

    Raises:
        AssertionError: If the second dimension of ``lmspc`` is not ``n_mels``.

    """
    assert lmspc.shape[1] == n_mels
    fmin = 0 if fmin is None else fmin
    fmax = fs / 2 if fmax is None else fmax

    # Undo the base-10 logarithm to get back mel-scale magnitudes.
    mspc = np.power(10.0, lmspc)

    # Pass everything by keyword: recent librosa releases made these
    # parameters keyword-only, while older releases accept keywords as well.
    mel_basis = librosa.filters.mel(
        sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
    )
    inv_mel_basis = np.linalg.pinv(mel_basis)

    # The pseudo-inverse projection is not constrained to be positive,
    # so clip the result from below with EPS.
    return np.maximum(EPS, np.dot(inv_mel_basis, mspc.T).T)
def griffin_lim(
    spc: np.ndarray,
    n_fft: int,
    n_shift: int,
    win_length: Optional[int] = None,
    window: Optional[str] = "hann",
    n_iter: Optional[int] = 32,
) -> np.ndarray:
    """Convert linear spectrogram into waveform using Griffin-Lim.

    With librosa >= 0.7.0 the built-in ``librosa.griffinlim`` is used.
    Otherwise a plain iterative phase-estimation loop built on
    ``librosa.stft`` / ``librosa.istft`` is run, starting from random phase.

    Args:
        spc: Linear spectrogram (T, n_fft // 2 + 1).
        n_fft: The number of FFT points.
        n_shift: Shift size in points.
        win_length: Window length in points.
        window: Window function type.
        n_iter: The number of iterations.

    Returns:
        Reconstructed waveform (N,).

    Raises:
        AssertionError: If the second dimension of ``spc`` is not
            ``n_fft // 2 + 1``.

    """
    # assert the size of input linear spectrogram
    assert spc.shape[1] == n_fft // 2 + 1

    if LooseVersion(librosa.__version__) >= LooseVersion("0.7.0"):
        # use librosa's fast Griffin-Lim algorithm
        spc = np.abs(spc.T)
        return librosa.griffinlim(
            S=spc,
            n_iter=n_iter,
            hop_length=n_shift,
            win_length=win_length,
            window=window,
            # spc is now (freq, T): centering is disabled for a single frame.
            center=spc.shape[1] > 1,
        )

    # use slower version of Griffin-Lim algorithm
    logging.warning(
        "librosa version is old. use slow version of Grriffin-Lim algorithm. "
        "if you want to use fast Griffin-Lim, please update librosa via "
        "`source ./path.sh && pip install librosa==0.7.0`."
    )
    # The builtin ``complex`` is used because the ``np.complex`` alias
    # no longer exists in current NumPy releases.
    cspc = np.abs(spc).astype(complex).T

    # Start from uniformly random phase, then alternate between the time and
    # the frequency domain, keeping the magnitude fixed to ``cspc``.
    angles = np.exp(2j * np.pi * np.random.rand(*cspc.shape))
    y = librosa.istft(
        cspc * angles, hop_length=n_shift, win_length=win_length, window=window
    )
    for _ in range(n_iter):
        estimate = librosa.stft(
            y,
            n_fft=n_fft,
            hop_length=n_shift,
            win_length=win_length,
            window=window,
        )
        angles = np.exp(1j * np.angle(estimate))
        y = librosa.istft(
            cspc * angles, hop_length=n_shift, win_length=win_length, window=window
        )
    return y
# TODO(kan-bayashi): write as torch.nn.Module
class Spectrogram2Waveform(object):
    """Spectrogram to waveform conversion module.

    Instances are callable. When ``n_mels`` is given, the input is first
    converted from log Mel filterbank to linear spectrogram, and then
    Griffin-Lim is applied; otherwise Griffin-Lim is applied directly.
    """

    def __init__(
        self,
        n_fft: int,
        n_shift: int,
        fs: Optional[int] = None,
        n_mels: Optional[int] = None,
        win_length: Optional[int] = None,
        window: Optional[str] = "hann",
        fmin: Optional[int] = None,
        fmax: Optional[int] = None,
        griffin_lim_iters: Optional[int] = 32,
    ):
        """Initialize module.

        Args:
            n_fft: The number of FFT points.
            n_shift: Shift size in points.
            fs: Sampling frequency. Only used for the mel-to-linear
                conversion.
            n_mels: The number of mel basis. If None, no mel-to-linear
                conversion is performed.
            win_length: Window length in points.
            window: Window function type.
            fmin: Minimum frequency to analyze.
            fmax: Maximum frequency to analyze.
            griffin_lim_iters: The number of iterations.

        """
        # Parameters defaulting to None are annotated as Optional explicitly
        # so that the runtime type check below does not depend on the
        # implicit-Optional handling of the typing module.
        assert check_argument_types()
        self.fs = fs

        # The mel-to-linear step exists only when a mel dimension is given.
        if n_mels is not None:
            self.logmel2linear = partial(
                logmel2linear, fs=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
            )
        else:
            self.logmel2linear = None

        self.griffin_lim = partial(
            griffin_lim,
            n_fft=n_fft,
            n_shift=n_shift,
            win_length=win_length,
            window=window,
            n_iter=griffin_lim_iters,
        )

        # Parameters reported by __repr__.
        self.params = dict(
            n_fft=n_fft,
            n_shift=n_shift,
            win_length=win_length,
            window=window,
            n_iter=griffin_lim_iters,
        )
        if n_mels is not None:
            self.params.update(fs=fs, n_mels=n_mels, fmin=fmin, fmax=fmax)

    def __repr__(self):
        """Return the class name followed by the stored parameters."""
        # Every entry keeps its trailing ", " so that the output format
        # stays exactly as it has always been.
        fields = "".join(f"{k}={v}, " for k, v in self.params.items())
        return f"{self.__class__.__name__}({fields})"

    def __call__(self, spc):
        """Convert spectrogram to waveform.

        Args:
            spc: Log Mel filterbank (T, n_mels)
                or linear spectrogram (T, n_fft // 2 + 1).

        Returns:
            Reconstructed waveform (N,).

        """
        if self.logmel2linear is not None:
            spc = self.logmel2linear(spc)
        return self.griffin_lim(spc)
|