voice-clone / spectrum.py
renator's picture
fix build issue and env
aaa69e0
raw
history blame
97.3 kB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Utilities for spectral processing"""
import warnings
import numpy as np
import scipy
import scipy.ndimage
import scipy.signal
import scipy.interpolate
from numba import jit
from . import convert
from .fft import get_fftlib
from .audio import resample
from .._cache import cache
from .. import util
from ..util.exceptions import ParameterError
from ..filters import get_window, semitone_filterbank
from ..filters import window_sumsquare
from numpy.typing import DTypeLike
from typing import Any, Callable, Optional, Tuple, List, Union, overload
from typing_extensions import Literal
from .._typing import _WindowSpec, _PadMode, _PadModeSTFT
# Public API of this module: the names listed here are the ones exported
# by a star-import of the module.
__all__ = [
    # Forward / inverse short-time Fourier transforms
    "stft",
    "istft",
    "magphase",
    "iirt",
    "reassigned_spectrogram",
    "phase_vocoder",
    # Scaling and weighting helpers
    "perceptual_weighting",
    "power_to_db",
    "db_to_power",
    "amplitude_to_db",
    "db_to_amplitude",
    "fmt",
    "pcen",
    "griffinlim",
]
@cache(level=20)
def stft(
    y: np.ndarray,
    *,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    pad_mode: _PadModeSTFT = "constant",
    out: Optional[np.ndarray] = None,
) -> np.ndarray:
    """Short-time Fourier transform (STFT).

    The STFT represents a signal in the time-frequency domain by
    computing discrete Fourier transforms (DFT) over short overlapping
    windows.

    This function returns a complex-valued matrix D such that

    - ``np.abs(D[..., f, t])`` is the magnitude of frequency bin ``f``
      at frame ``t``, and

    - ``np.angle(D[..., f, t])`` is the phase of frequency bin ``f``
      at frame ``t``.

    The integers ``t`` and ``f`` can be converted to physical units by means
    of the utility functions `frames_to_samples` and `fft_frequencies`.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)], real-valued
        input signal. Multi-channel is supported.

    n_fft : int > 0 [scalar]
        length of the windowed signal after padding with zeros.
        The number of rows in the STFT matrix ``D`` is ``(1 + n_fft/2)``.
        The default value, ``n_fft=2048`` samples, corresponds to a physical
        duration of 93 milliseconds at a sample rate of 22050 Hz, i.e. the
        default sample rate in librosa. This value is well adapted for music
        signals. However, in speech processing, the recommended value is 512,
        corresponding to 23 milliseconds at a sample rate of 22050 Hz.
        In any case, we recommend setting ``n_fft`` to a power of two for
        optimizing the speed of the fast Fourier transform (FFT) algorithm.

    hop_length : int > 0 [scalar]
        number of audio samples between adjacent STFT columns.

        Smaller values increase the number of columns in ``D`` without
        affecting the frequency resolution of the STFT.

        If unspecified, defaults to ``win_length // 4`` (see below).
        The resolved value (explicit or default) must be a positive integer.

    win_length : int <= n_fft [scalar]
        Each frame of audio is windowed by ``window`` of length ``win_length``
        and then padded with zeros to match ``n_fft``.

        Smaller values improve the temporal resolution of the STFT (i.e. the
        ability to discriminate impulses that are closely spaced in time)
        at the expense of frequency resolution (i.e. the ability to discriminate
        pure tones that are closely spaced in frequency). This effect is known
        as the time-frequency localization trade-off and needs to be adjusted
        according to the properties of the input signal ``y``.

        If unspecified, defaults to ``win_length = n_fft``.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        Either:

        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a vector or array of length ``n_fft``

        Defaults to a raised cosine window (`'hann'`), which is adequate for
        most applications in audio signal processing.

        .. see also:: `filters.get_window`

    center : boolean
        If ``True``, the signal ``y`` is padded so that frame
        ``D[:, t]`` is centered at ``y[t * hop_length]``.

        If ``False``, then ``D[:, t]`` begins at ``y[t * hop_length]``.

        Defaults to ``True``, which simplifies the alignment of ``D`` onto a
        time grid by means of `librosa.frames_to_samples`.
        Note, however, that ``center`` must be set to `False` when analyzing
        signals with `librosa.stream`.

        .. see also:: `librosa.stream`

    dtype : np.dtype, optional
        Complex numeric type for ``D``. Default is inferred to match the
        precision of the input signal.

    pad_mode : string or function
        If ``center=True``, this argument is passed to `np.pad` for padding
        the edges of the signal ``y``. By default (``pad_mode="constant"``),
        ``y`` is padded on both sides with zeros.

        .. note:: Not all padding modes supported by `numpy.pad` are supported here.
            `wrap`, `mean`, `maximum`, `median`, and `minimum` are not supported.

            Other modes that depend at most on input values at the edges of the
            signal (e.g., `constant`, `edge`, `linear_ramp`) are supported.

        If ``center=False``, this argument is ignored.

        .. see also:: `numpy.pad`

    out : np.ndarray or None
        A pre-allocated, complex-valued array to store the STFT results.
        This must be of compatible shape and dtype for the given input parameters.

        If `out` is larger than necessary for the provided input signal, then only
        a prefix slice of `out` will be used.

        If not provided, a new array is allocated and returned.

    Returns
    -------
    D : np.ndarray [shape=(..., 1 + n_fft/2, n_frames), dtype=dtype]
        Complex-valued matrix of short-term Fourier transform
        coefficients.

        If a pre-allocated `out` array is provided, then `D` will be
        a reference to `out`.

        If `out` is larger than necessary, then `D` will be a sliced
        view: `D = out[..., :n_frames]`.

    Raises
    ------
    ParameterError
        - If ``hop_length`` (explicit, or derived from ``win_length``) is not
          a positive integer.
        - If ``center=True`` and ``pad_mode`` is one of the unsupported modes.
        - If ``center=False`` and ``n_fft`` exceeds the signal length.
        - If ``out`` is provided but its leading dimensions do not exactly
          match the target shape, its last dimension is too small, or it is
          not complex-valued.

    Warns
    -----
    UserWarning
        If ``center=True`` and ``n_fft`` exceeds the signal length.

    See Also
    --------
    istft : Inverse STFT
    reassigned_spectrogram : Time-frequency reassigned spectrogram

    Notes
    -----
    This function caches at level 20.

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> S = np.abs(librosa.stft(y))
    >>> S
    array([[5.395e-03, 3.332e-03, ..., 9.862e-07, 1.201e-05],
           [3.244e-03, 2.690e-03, ..., 9.536e-07, 1.201e-05],
           ...,
           [7.523e-05, 3.722e-05, ..., 1.188e-04, 1.031e-03],
           [7.640e-05, 3.944e-05, ..., 5.180e-04, 1.346e-03]],
          dtype=float32)

    Use left-aligned frames, instead of centered frames

    >>> S_left = librosa.stft(y, center=False)

    Use a shorter hop length

    >>> D_short = librosa.stft(y, hop_length=64)

    Display a spectrogram

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots()
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(S,
    ...                                                        ref=np.max),
    ...                                y_axis='log', x_axis='time', ax=ax)
    >>> ax.set_title('Power spectrogram')
    >>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
    """
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    # Validate the *resolved* hop: a default derived from win_length < 4 is 0,
    # which would otherwise surface below as a ZeroDivisionError.
    if not util.is_positive_int(hop_length):
        raise ParameterError(f"hop_length={hop_length} must be a positive integer")

    # Check audio is valid
    util.valid_audio(y, mono=False)

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, size=n_fft)

    # Reshape so that the window can be broadcast
    fft_window = util.expand_to(fft_window, ndim=1 + y.ndim, axes=-2)

    # Pad the time series so that frames are centered
    if center:
        if pad_mode in ("wrap", "maximum", "mean", "median", "minimum"):
            # Note: padding with a user-provided function "works", but
            # use at your own risk.
            # Since we don't pass-through kwargs here, any arguments
            # to a user-provided pad function should be encapsulated
            # by using functools.partial:
            #
            # >>> my_pad_func = functools.partial(pad_func, foo=x, bar=y)
            # >>> librosa.stft(..., pad_mode=my_pad_func)
            raise ParameterError(
                f"pad_mode='{pad_mode}' is not supported by librosa.stft"
            )

        if n_fft > y.shape[-1]:
            warnings.warn(
                f"n_fft={n_fft} is too large for input signal of length={y.shape[-1]}"
            )

        # Set up the padding array to be empty, and we'll fix the target dimension later
        padding = [(0, 0) for _ in range(y.ndim)]

        # How many frames depend on left padding?
        start_k = int(np.ceil(n_fft // 2 / hop_length))

        # What's the first frame that depends on extra right-padding?
        tail_k = (y.shape[-1] + n_fft // 2 - n_fft) // hop_length + 1

        if tail_k <= start_k:
            # If tail and head overlap, then just copy-pad the signal and carry on
            start = 0
            extra = 0
            padding[-1] = (n_fft // 2, n_fft // 2)
            y = np.pad(y, padding, mode=pad_mode)
        else:
            # If tail and head do not overlap, then we can implement padding on each part separately
            # and avoid a full copy-pad

            # "Middle" of the signal starts here, and does not depend on head padding
            start = start_k * hop_length - n_fft // 2
            padding[-1] = (n_fft // 2, 0)

            # +1 here is to ensure enough samples to fill the window
            # fixes bug #1567
            y_pre = np.pad(
                y[..., : (start_k - 1) * hop_length - n_fft // 2 + n_fft + 1],
                padding,
                mode=pad_mode,
            )
            y_frames_pre = util.frame(y_pre, frame_length=n_fft, hop_length=hop_length)
            # Trim this down to the exact number of frames we should have
            y_frames_pre = y_frames_pre[..., :start_k]

            # How many extra frames do we have from the head?
            extra = y_frames_pre.shape[-1]

            # Determine if we have any frames that will fit inside the tail pad
            if tail_k * hop_length - n_fft // 2 + n_fft <= y.shape[-1] + n_fft // 2:
                padding[-1] = (0, n_fft // 2)
                y_post = np.pad(
                    y[..., (tail_k) * hop_length - n_fft // 2 :], padding, mode=pad_mode
                )
                y_frames_post = util.frame(
                    y_post, frame_length=n_fft, hop_length=hop_length
                )
                # How many extra frames do we have from the tail?
                extra += y_frames_post.shape[-1]
            else:
                # In this event, the first frame that touches tail padding would run off
                # the end of the padded array
                # We'll circumvent this by allocating an empty frame buffer for the tail
                # this keeps the subsequent logic simple
                post_shape = list(y_frames_pre.shape)
                post_shape[-1] = 0
                y_frames_post = np.empty_like(y_frames_pre, shape=post_shape)
    else:
        if n_fft > y.shape[-1]:
            raise ParameterError(
                f"n_fft={n_fft} is too large for uncentered analysis of input signal of length={y.shape[-1]}"
            )

        # "Middle" of the signal starts at sample 0
        start = 0
        # We have no extra frames
        extra = 0

    fft = get_fftlib()

    if dtype is None:
        dtype = util.dtype_r2c(y.dtype)

    # Window the time series.
    y_frames = util.frame(y[..., start:], frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    shape = list(y_frames.shape)

    # This is our frequency dimension
    shape[-2] = 1 + n_fft // 2

    # If there's padding, there will be extra head and tail frames
    shape[-1] += extra

    if out is None:
        stft_matrix = np.zeros(shape, dtype=dtype, order="F")
    elif not (
        # Shapes are integers: compare them exactly. A tolerance-based
        # comparison (np.allclose) can accept slightly-off dimensions and
        # raises a bare ValueError when the ranks cannot broadcast.
        tuple(out.shape[:-1]) == tuple(shape[:-1])
        and out.shape[-1] >= shape[-1]
    ):
        raise ParameterError(
            f"Shape mismatch for provided output array out.shape={out.shape} and target shape={shape}"
        )
    elif not np.iscomplexobj(out):
        raise ParameterError(f"output with dtype={out.dtype} is not of complex type")
    else:
        if tuple(out.shape) == tuple(shape):
            stft_matrix = out
        else:
            # `out` has spare columns: only write into the leading slice
            stft_matrix = out[..., : shape[-1]]

    # Fill in the warm-up
    if center and extra > 0:
        off_start = y_frames_pre.shape[-1]
        stft_matrix[..., :off_start] = fft.rfft(fft_window * y_frames_pre, axis=-2)

        off_end = y_frames_post.shape[-1]
        if off_end > 0:
            stft_matrix[..., -off_end:] = fft.rfft(fft_window * y_frames_post, axis=-2)
    else:
        off_start = 0

    # Process the interior frames in column blocks bounded by MAX_MEM_BLOCK
    n_columns = int(
        util.MAX_MEM_BLOCK // (np.prod(y_frames.shape[:-1]) * y_frames.itemsize)
    )
    n_columns = max(n_columns, 1)

    for bl_s in range(0, y_frames.shape[-1], n_columns):
        bl_t = min(bl_s + n_columns, y_frames.shape[-1])

        stft_matrix[..., bl_s + off_start : bl_t + off_start] = fft.rfft(
            fft_window * y_frames[..., bl_s:bl_t], axis=-2
        )
    return stft_matrix
@cache(level=30)
def istft(
    stft_matrix: np.ndarray,
    *,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    n_fft: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    length: Optional[int] = None,
    out: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Inverse short-time Fourier transform (ISTFT).

    Converts a complex-valued spectrogram ``stft_matrix`` to time-series ``y``
    by minimizing the mean squared error between ``stft_matrix`` and STFT of
    ``y`` as described in [#]_ up to Section 2 (reconstruction from MSTFT).

    In general, window function, hop length and other parameters should be same
    as in stft, which mostly leads to perfect reconstruction of a signal from
    unmodified ``stft_matrix``.

    .. [#] D. W. Griffin and J. S. Lim,
        "Signal estimation from modified short-time Fourier transform,"
        IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.

    Parameters
    ----------
    stft_matrix : np.ndarray [shape=(..., 1 + n_fft//2, t)]
        STFT matrix from ``stft``

    hop_length : int > 0 [scalar]
        Number of audio samples between adjacent STFT columns.
        If unspecified, defaults to ``win_length // 4``.
        The resolved value (explicit or default) must be a positive integer.

    win_length : int <= n_fft = 2 * (stft_matrix.shape[-2] - 1)
        When reconstructing the time series, each frame is windowed
        and each sample is normalized by the sum of squared window
        according to the ``window`` function (see below).

        If unspecified, defaults to ``n_fft``.

    n_fft : int > 0 or None
        The number of samples per frame in the input spectrogram.
        By default, this will be inferred from the shape of ``stft_matrix``
        as ``2 * (stft_matrix.shape[-2] - 1)``.
        However, if an odd frame length was used, you can specify the correct
        length by setting ``n_fft``.

    window : string, tuple, number, function, np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a user-specified window vector of length ``n_fft``

        .. see also:: `filters.get_window`

    center : boolean
        - If ``True``, ``D`` is assumed to have centered frames.
        - If ``False``, ``D`` is assumed to have left-aligned frames.

    dtype : numeric type
        Real numeric type for ``y``. Default is to match the numerical
        precision of the input spectrogram.

    length : int > 0, optional
        If provided, the output ``y`` is zero-padded or clipped to exactly
        ``length`` samples.

    out : np.ndarray or None
        A pre-allocated, real-valued array to store the reconstructed signal
        ``y``. This must be of exactly the correct shape for the given input
        parameters. Its contents are overwritten.

        If not provided, a new array is allocated and returned.

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        time domain signal reconstructed from ``stft_matrix``.
        If ``stft_matrix`` contains more than two axes
        (e.g., from a stereo input signal), then ``y`` will match shape on the leading dimensions.

    Raises
    ------
    ParameterError
        - If ``hop_length`` (explicit, or derived from ``win_length``) is not
          a positive integer.
        - If ``out`` is provided and its shape does not exactly match the
          shape of the reconstructed signal.

    See Also
    --------
    stft : Short-time Fourier Transform

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> D = librosa.stft(y)
    >>> y_hat = librosa.istft(D)
    >>> y_hat
    array([-1.407e-03, -4.461e-04, ...,  5.131e-06, -1.417e-05],
          dtype=float32)

    Exactly preserving length of the input signal requires explicit padding.
    Otherwise, a partial frame at the end of ``y`` will not be represented.

    >>> n = len(y)
    >>> n_fft = 2048
    >>> y_pad = librosa.util.fix_length(y, size=n + n_fft // 2)
    >>> D = librosa.stft(y_pad, n_fft=n_fft)
    >>> y_out = librosa.istft(D, length=n)
    >>> np.max(np.abs(y - y_out))
    8.940697e-08
    """
    if n_fft is None:
        n_fft = 2 * (stft_matrix.shape[-2] - 1)

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    # Same validation as the forward transform: the hop is used as a divisor
    # and as a range/slice step below, so it must be a positive integer.
    if not util.is_positive_int(hop_length):
        raise ParameterError(f"hop_length={hop_length} must be a positive integer")

    ifft_window = get_window(window, win_length, fftbins=True)

    # Pad out to match n_fft, and add broadcasting axes
    ifft_window = util.pad_center(ifft_window, size=n_fft)
    ifft_window = util.expand_to(ifft_window, ndim=stft_matrix.ndim, axes=-2)

    # For efficiency, trim STFT frames according to signal length if available
    if length:
        if center:
            padded_length = length + 2 * (n_fft // 2)
        else:
            padded_length = length
        n_frames = min(stft_matrix.shape[-1], int(np.ceil(padded_length / hop_length)))
    else:
        n_frames = stft_matrix.shape[-1]

    if dtype is None:
        dtype = util.dtype_c2r(stft_matrix.dtype)

    shape = list(stft_matrix.shape[:-2])
    expected_signal_len = n_fft + hop_length * (n_frames - 1)

    if length:
        expected_signal_len = length
    elif center:
        expected_signal_len -= 2 * (n_fft // 2)

    shape.append(expected_signal_len)

    if out is None:
        y = np.zeros(shape, dtype=dtype)
    elif tuple(out.shape) != tuple(shape):
        # Shapes are integers: compare them exactly. np.allclose applies a
        # relative tolerance (so a long buffer that is off by a few samples
        # would pass) and raises ValueError on rank mismatch.
        raise ParameterError(
            f"Shape mismatch for provided output array out.shape={out.shape} != {shape}"
        )
    else:
        y = out
        # Since we'll be doing overlap-add here, this needs to be initialized to zero.
        y.fill(0.0)

    fft = get_fftlib()

    if center:
        # First frame that does not depend on padding
        #  k * hop_length - n_fft//2 >= 0
        # k * hop_length >= n_fft // 2
        # k >= (n_fft//2 / hop_length)
        start_frame = int(np.ceil((n_fft // 2) / hop_length))

        # Do overlap-add on the head block
        ytmp = ifft_window * fft.irfft(stft_matrix[..., :start_frame], n=n_fft, axis=-2)

        shape[-1] = n_fft + hop_length * (start_frame - 1)
        head_buffer = np.zeros(shape, dtype=dtype)

        __overlap_add(head_buffer, ytmp, hop_length)

        # If y is smaller than the head buffer, take everything
        if y.shape[-1] < shape[-1] - n_fft // 2:
            y[..., :] = head_buffer[..., n_fft // 2 : y.shape[-1] + n_fft // 2]
        else:
            # Trim off the first n_fft//2 samples from the head and copy into target buffer
            y[..., : shape[-1] - n_fft // 2] = head_buffer[..., n_fft // 2 :]

        # This offset compensates for any differences between frame alignment
        # and padding truncation
        offset = start_frame * hop_length - n_fft // 2
    else:
        start_frame = 0
        offset = 0

    # Invert the remaining frames in column blocks bounded by MAX_MEM_BLOCK
    n_columns = int(
        util.MAX_MEM_BLOCK // (np.prod(stft_matrix.shape[:-1]) * stft_matrix.itemsize)
    )
    n_columns = max(n_columns, 1)

    frame = 0
    for bl_s in range(start_frame, n_frames, n_columns):
        bl_t = min(bl_s + n_columns, n_frames)

        # invert the block and apply the window function
        ytmp = ifft_window * fft.irfft(stft_matrix[..., bl_s:bl_t], n=n_fft, axis=-2)

        # Overlap-add the istft block starting at the i'th frame
        __overlap_add(y[..., frame * hop_length + offset :], ytmp, hop_length)

        frame += bl_t - bl_s

    # Normalize by sum of squared window
    ifft_window_sum = window_sumsquare(
        window=window,
        n_frames=n_frames,
        win_length=win_length,
        n_fft=n_fft,
        hop_length=hop_length,
        dtype=dtype,
    )

    if center:
        start = n_fft // 2
    else:
        start = 0

    ifft_window_sum = util.fix_length(ifft_window_sum[..., start:], size=y.shape[-1])

    # Only divide where the window envelope is numerically non-zero
    approx_nonzero_indices = ifft_window_sum > util.tiny(ifft_window_sum)

    y[..., approx_nonzero_indices] /= ifft_window_sum[approx_nonzero_indices]

    return y
@jit(nopython=True, cache=False)
def __overlap_add(y, ytmp, hop_length):
    # numba-accelerated overlap add for inverse stft
    #
    # y is the pre-allocated output buffer; it is modified in place
    # ytmp is the windowed inverse-stft frames
    #   (frame samples on axis -2, frame index on axis -1)
    # hop_length is the hop-length of the STFT analysis
    #
    # Frame `k` is accumulated into y[..., k * hop_length : k * hop_length + n_fft],
    # truncated at the end of `y` when the frame would overrun the buffer.
    n_fft = ytmp.shape[-2]
    n_out = y.shape[-1]

    for frame in range(ytmp.shape[-1]):
        sample = frame * hop_length

        # Frames starting at or beyond the end of the buffer contribute
        # nothing.  Stopping here also keeps the sample count non-negative:
        # a negative count would select an empty slice of `y` but a
        # non-empty slice of `ytmp`, which is a shape mismatch.
        if sample >= n_out:
            break

        # Number of samples of this frame that fit in the buffer
        N = min(n_fft, n_out - sample)

        y[..., sample : (sample + N)] += ytmp[..., :N, frame]
def __reassign_frequencies(
    y: np.ndarray,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    pad_mode: _PadModeSTFT = "constant",
) -> Tuple[np.ndarray, np.ndarray]:
    """Instantaneous frequencies based on a spectrogram representation.

    The reassignment vector is calculated using equation 5.20 in Flandrin,
    Auger, & Chassande-Mottin 2002::

        omega_reassigned = omega - np.imag(S_dh/S_h)

    where ``S_h`` is the complex STFT calculated using the original window, and
    ``S_dh`` is the complex STFT calculated using the derivative of the original
    window.

    See `reassigned_spectrogram` for references.

    Frequency reassignment assumes that the energy in each FFT bin is
    associated with exactly one signal component. Padding at the edges of the
    signal (``center=True``) may invalidate the reassigned estimates in the
    boundary frames; use ``center=False`` to avoid padding altogether.
    Note that ``pad_mode="wrap"`` is rejected by `stft` when ``center=True``.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n,)], real-valued
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    S : np.ndarray [shape=(..., d, t)] or None
        (optional) complex STFT calculated using the other arguments provided
        to `__reassign_frequencies`

    n_fft : int > 0 [scalar]
        FFT window size. Defaults to 2048.

    hop_length : int > 0 [scalar]
        hop length, number samples between subsequent frames.
        If not supplied, defaults to ``win_length // 4``.

    win_length : int > 0, <= n_fft
        Window length. Defaults to ``n_fft``.
        See ``stft`` for details.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a user-specified window vector of length ``n_fft``

        See `stft` for details.

        .. see also:: `filters.get_window`

    center : boolean
        - If ``True``, the signal ``y`` is padded so that frame
          ``S[:, t]`` is centered at ``y[t * hop_length]``.
        - If ``False``, then ``S[:, t]`` begins at ``y[t * hop_length]``.

    dtype : numeric type
        Complex numeric type for ``S``. Default is inferred to match
        the numerical precision of the input signal.

    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.

    Returns
    -------
    freqs : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=real]
        Instantaneous frequencies:
        ``freqs[f, t]`` is the frequency for bin ``f``, frame ``t``.
    S : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=complex]
        Short-time Fourier transform

    Warns
    -----
    RuntimeWarning
        Frequencies with zero support will produce a divide-by-zero warning and
        will be returned as `np.nan`.

    See Also
    --------
    stft : Short-time Fourier Transform
    reassigned_spectrogram : Time-frequency reassigned spectrogram

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> frequencies, S = librosa.core.spectrum.__reassign_frequencies(y, sr=sr)
    >>> frequencies
    array([[0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00],
           [3.628e+00, 4.698e+00, ..., 1.239e+01, 1.072e+01],
           ...,
           [1.101e+04, 1.102e+04, ..., 1.105e+04, 1.102e+04],
           [1.102e+04, 1.102e+04, ..., 1.102e+04, 1.102e+04]])
    """
    # retrieve window samples if needed so that the window derivative can be
    # calculated
    if win_length is None:
        win_length = n_fft

    window = get_window(window, win_length, fftbins=True)
    window = util.pad_center(window, size=n_fft)

    # Resolve the default hop here, from the *original* win_length.
    # The window handed to `stft` below is already padded to n_fft samples and
    # no win_length is forwarded, so leaving hop_length=None would make `stft`
    # fall back to n_fft // 4.  That disagrees with the documented default and
    # with `__reassign_times`, and yields a different frame count than a
    # caller-provided `S` whenever win_length < n_fft.
    if hop_length is None:
        hop_length = int(win_length // 4)

    if S is None:
        if dtype is None:
            dtype = util.dtype_r2c(y.dtype)
        S_h = stft(
            y=y,
            n_fft=n_fft,
            hop_length=hop_length,
            window=window,
            center=center,
            dtype=dtype,
            pad_mode=pad_mode,
        )
    else:
        if dtype is None:
            dtype = S.dtype
        S_h = S

    # cyclic gradient to correctly handle edges of a periodic window
    window_derivative = util.cyclic_gradient(window)

    S_dh = stft(
        y=y,
        n_fft=n_fft,
        hop_length=hop_length,
        window=window_derivative,
        center=center,
        dtype=dtype,
        pad_mode=pad_mode,
    )

    # equation 5.20 of Flandrin, Auger, & Chassande-Mottin 2002
    # the sign of the correction is reversed in some papers - see Plante,
    # Meyer, & Ainsworth 1998 pp. 283-284
    correction = -np.imag(S_dh / S_h)

    # Bin center frequencies, broadcast along the frequency axis, plus the
    # correction rescaled by sr / (2 * pi)
    freqs = convert.fft_frequencies(sr=sr, n_fft=n_fft)
    freqs = util.expand_to(freqs, ndim=correction.ndim, axes=-2) + correction * (
        0.5 * sr / np.pi
    )

    return freqs, S_h
def __reassign_times(
    y: np.ndarray,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    pad_mode: _PadModeSTFT = "constant",
) -> Tuple[np.ndarray, np.ndarray]:
    """Time reassignments based on a spectrogram representation.

    The reassignment vector is calculated using equation 5.23 in Flandrin,
    Auger, & Chassande-Mottin 2002::

        t_reassigned = t + np.real(S_th/S_h)

    where ``S_h`` is the complex STFT calculated using the original window, and
    ``S_th`` is the complex STFT calculated using the original window multiplied
    by the time offset from the window center.

    See `reassigned_spectrogram` for references.

    Zero padding (``pad_mode="constant"``, the default) or ``center=False``
    is recommended. Time reassignment assumes that the energy in each FFT bin
    is associated with exactly one impulse event. Padding modes that copy
    signal content into the margins (e.g. reflection) may invalidate the
    reassigned estimates in the boundary frames.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n,)], real-valued
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    S : np.ndarray [shape=(..., d, t)] or None
        (optional) complex STFT calculated using the other arguments provided
        to `__reassign_times`

    n_fft : int > 0 [scalar]
        FFT window size. Defaults to 2048.

    hop_length : int > 0 [scalar]
        hop length, number samples between subsequent frames.
        If not supplied, defaults to ``win_length // 4``.

    win_length : int > 0, <= n_fft
        Window length. Defaults to ``n_fft``.
        See `stft` for details.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a user-specified window vector of length ``n_fft``

        See `stft` for details.

        .. see also:: `filters.get_window`

    center : boolean
        - If ``True``, the signal ``y`` is padded so that frame
          ``S[:, t]`` is centered at ``y[t * hop_length]``.
        - If ``False``, then ``S[:, t]`` begins at ``y[t * hop_length]``.

    dtype : numeric type
        Complex numeric type for ``S``. Default is inferred to match
        the precision of the input signal.

    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.

    Returns
    -------
    times : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=real]
        Reassigned times:
        ``times[f, t]`` is the time for bin ``f``, frame ``t``.
    S : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=complex]
        Short-time Fourier transform

    Warns
    -----
    RuntimeWarning
        Time estimates with zero support will produce a divide-by-zero warning
        and will be returned as `np.nan`.

    See Also
    --------
    stft : Short-time Fourier Transform
    reassigned_spectrogram : Time-frequency reassigned spectrogram

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> times, S = librosa.core.spectrum.__reassign_times(y, sr=sr)
    >>> times
    array([[ 2.268e-05,  1.144e-02, ...,  5.332e+00,  5.333e+00],
           [ 2.268e-05,  1.451e-02, ...,  5.334e+00,  5.333e+00],
           ...,
           [ 2.268e-05, -6.177e-04, ...,  5.368e+00,  5.327e+00],
           [ 2.268e-05,  1.420e-03, ...,  5.307e+00,  5.328e+00]])
    """
    # The window samples are needed explicitly (at full FFT length) so that
    # a time-weighted copy can be built from them.
    frame_len = n_fft if win_length is None else win_length
    win_samples = util.pad_center(
        get_window(window, frame_len, fftbins=True), size=n_fft
    )

    # The hop must be known here so that frame times can be computed.
    hop = int(frame_len // 4) if hop_length is None else hop_length

    # Arguments shared by every STFT evaluated in this function
    stft_args = dict(
        y=y,
        n_fft=n_fft,
        hop_length=hop,
        center=center,
        pad_mode=pad_mode,
    )

    # Reference STFT: reuse the caller's if one was supplied
    if S is not None:
        if dtype is None:
            dtype = S.dtype
        S_h = S
    else:
        if dtype is None:
            dtype = util.dtype_r2c(y.dtype)
        S_h = stft(window=win_samples, dtype=dtype, **stft_args)

    # Sample offsets relative to the window center: integer offsets for odd
    # n_fft, half-integer offsets for even n_fft.
    half = n_fft // 2
    center_offsets: np.ndarray = (
        np.arange(-half, half + 1) if n_fft % 2 else np.arange(0.5 - half, half)
    )

    # STFT computed with the time-weighted window
    S_th = stft(window=win_samples * center_offsets, dtype=dtype, **stft_args)

    # equation 5.23 of Flandrin, Auger, & Chassande-Mottin 2002
    # the sign of the correction is reversed in some papers - see Plante,
    # Meyer, & Ainsworth 1998 pp. 283-284
    correction = np.real(S_th / S_h)

    # Left-aligned frames pass n_fft to the frame-to-time conversion;
    # centered frames pass None.
    frame_times = convert.frames_to_time(
        np.arange(S_h.shape[-1]),
        sr=sr,
        hop_length=hop,
        n_fft=None if center else n_fft,
    )

    # Broadcast frame times along the last axis and add the correction,
    # converted from samples to seconds.
    reassigned = (
        util.expand_to(frame_times, ndim=correction.ndim, axes=-1) + correction / sr
    )

    return reassigned, S_h
def reassigned_spectrogram(
    y: np.ndarray,
    *,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    reassign_frequencies: bool = True,
    reassign_times: bool = True,
    ref_power: Union[float, Callable] = 1e-6,
    fill_nan: bool = False,
    clip: bool = True,
    dtype: Optional[DTypeLike] = None,
    pad_mode: _PadModeSTFT = "constant",
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    r"""Time-frequency reassigned spectrogram.

    The reassignment vectors are calculated using equations 5.20 and 5.23 in
    [#]_::

        t_reassigned = t + np.real(S_th/S_h)
        omega_reassigned = omega - np.imag(S_dh/S_h)

    where ``S_h`` is the complex STFT calculated using the original window,
    ``S_dh`` is the complex STFT calculated using the derivative of the original
    window, and ``S_th`` is the complex STFT calculated using the original window
    multiplied by the time offset from the window center. See [#]_ for
    additional algorithms, and [#]_ and [#]_ for history and discussion of the
    method.

    .. [#] Flandrin, P., Auger, F., & Chassande-Mottin, E. (2002).
        Time-Frequency reassignment: From principles to algorithms. In
        Applications in Time-Frequency Signal Processing (Vol. 10, pp.
        179-204). CRC Press.

    .. [#] Fulop, S. A., & Fitz, K. (2006). Algorithms for computing the
        time-corrected instantaneous frequency (reassigned) spectrogram, with
        applications. The Journal of the Acoustical Society of America, 119(1),
        360. doi:10.1121/1.2133000

    .. [#] Auger, F., Flandrin, P., Lin, Y.-T., McLaughlin, S., Meignen, S.,
        Oberlin, T., & Wu, H.-T. (2013). Time-Frequency Reassignment and
        Synchrosqueezing: An Overview. IEEE Signal Processing Magazine, 30(6),
        32-41. doi:10.1109/MSP.2013.2265316

    .. [#] Hainsworth, S., Macleod, M. (2003). Time-frequency reassignment: a
        review and analysis. Tech. Rep. CUED/FINFENG/TR.459, Cambridge
        University Engineering Department

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)], real-valued
        audio time series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    S : np.ndarray [shape=(..., d, t)] or None
        (optional) complex STFT calculated using the other arguments provided
        to ``reassigned_spectrogram``
    n_fft : int > 0 [scalar]
        FFT window size. Defaults to 2048.
    hop_length : int > 0 [scalar]
        hop length, number samples between subsequent frames.
        If not supplied, defaults to ``win_length // 4``.
    win_length : int > 0, <= n_fft
        Window length. Defaults to ``n_fft``.
        See `stft` for details.
    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a user-specified window vector of length ``n_fft``

        See `stft` for details.

        .. see also:: `filters.get_window`
    center : boolean
        - If ``True`` (default), the signal ``y`` is padded so that frame
          ``S[:, t]`` is centered at ``y[t * hop_length]``. See `Notes` for
          recommended usage in this function.
        - If ``False``, then ``S[:, t]`` begins at ``y[t * hop_length]``.
    reassign_frequencies : boolean
        - If ``True`` (default), the returned frequencies will be instantaneous
          frequency estimates.
        - If ``False``, the returned frequencies will be a read-only view of the
          STFT bin frequencies for all frames.
    reassign_times : boolean
        - If ``True`` (default), the returned times will be corrected
          (reassigned) time estimates for each bin.
        - If ``False``, the returned times will be a read-only view of the STFT
          frame times for all bins.
    ref_power : float >= 0 or callable
        Minimum power threshold for estimating time-frequency reassignments.
        Any bin with ``np.abs(S[f, t])**2 < ref_power`` will be returned as
        `np.nan` in both frequency and time, unless ``fill_nan`` is ``True``. If 0
        is provided, then only bins with zero power will be returned as
        `np.nan` (unless ``fill_nan=True``).
        If callable, the threshold is computed as ``ref_power(mags**2)``.
    fill_nan : boolean
        - If ``False`` (default), the frequency and time reassignments for bins
          below the power threshold provided in ``ref_power`` will be returned as
          `np.nan`.
        - If ``True``, the frequency and time reassignments for these bins will
          be returned as the bin center frequencies and frame times.
    clip : boolean
        - If ``True`` (default), estimated frequencies outside the range
          `[0, 0.5 * sr]` or times outside the range `[0, len(y) / sr]` will be
          clipped to those ranges.
        - If ``False``, estimated frequencies and times beyond the bounds of the
          spectrogram may be returned.
    dtype : numeric type
        Complex numeric type for STFT calculation. Default is inferred to match
        the precision of the input signal.
    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.

    Returns
    -------
    freqs, times, mags : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=real]
        Instantaneous frequencies:
            ``freqs[..., f, t]`` is the frequency for bin ``f``, frame ``t``.
            If ``reassign_frequencies=False``, this will instead be a read-only array
            of the same shape containing the bin center frequencies for all frames.

        Reassigned times:
            ``times[..., f, t]`` is the time for bin ``f``, frame ``t``.
            If ``reassign_times=False``, this will instead be a read-only array of
            the same shape containing the frame times for all bins.

        Magnitudes from short-time Fourier transform:
            ``mags[..., f, t]`` is the magnitude for bin ``f``, frame ``t``.

    Raises
    ------
    ParameterError
        If ``ref_power`` is a negative number, or if both
        ``reassign_frequencies`` and ``reassign_times`` are ``False``.

    Warns
    -----
    RuntimeWarning
        Frequency or time estimates with zero support will produce a
        divide-by-zero warning, and will be returned as `np.nan` unless
        ``fill_nan=True``.

    See Also
    --------
    stft : Short-time Fourier Transform

    Notes
    -----
    It is recommended to use ``center=False`` with this function rather than the
    librosa default ``True``. Unlike ``stft``, reassigned times are not aligned to
    the left or center of each frame, so padding the signal does not affect the
    meaning of the reassigned times. However, reassignment assumes that the
    energy in each FFT bin is associated with exactly one signal component and
    impulse event.

    If ``reassign_times`` is ``False``, the frame times that are returned will be
    aligned to the left or center of the frame, depending on the value of
    ``center``. In this case, if ``center`` is ``True``, then ``pad_mode="wrap"`` is
    recommended for valid estimation of the instantaneous frequencies in the
    boundary frames.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> amin = 1e-10
    >>> n_fft = 64
    >>> sr = 4000
    >>> y = 1e-3 * librosa.clicks(times=[0.3], sr=sr, click_duration=1.0,
    ...                           click_freq=1200.0, length=8000) +\
    ...     1e-3 * librosa.clicks(times=[1.5], sr=sr, click_duration=0.5,
    ...                           click_freq=400.0, length=8000) +\
    ...     1e-3 * librosa.chirp(fmin=200, fmax=1600, sr=sr, duration=2.0) +\
    ...     1e-6 * np.random.randn(2*sr)
    >>> freqs, times, mags = librosa.reassigned_spectrogram(y=y, sr=sr,
    ...                                                     n_fft=n_fft)
    >>> mags_db = librosa.amplitude_to_db(mags, ref=np.max)

    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(mags_db, x_axis="s", y_axis="linear", sr=sr,
    ...                          hop_length=n_fft//4, ax=ax[0])
    >>> ax[0].set(title="Spectrogram", xlabel=None)
    >>> ax[0].label_outer()
    >>> ax[1].scatter(times, freqs, c=mags_db, cmap="magma", alpha=0.1, s=5)
    >>> ax[1].set_title("Reassigned spectrogram")
    >>> fig.colorbar(img, ax=ax, format="%+2.f dB")
    """
    if not callable(ref_power) and ref_power < 0:
        raise ParameterError("ref_power must be non-negative or callable.")

    if not reassign_frequencies and not reassign_times:
        raise ParameterError("reassign_frequencies or reassign_times must be True.")

    if win_length is None:
        win_length = n_fft

    if hop_length is None:
        hop_length = int(win_length // 4)

    # frequency and time reassignment if requested; each helper also returns
    # the plain STFT, which is passed along so it is only computed once
    if reassign_frequencies:
        freqs, S = __reassign_frequencies(
            y=y,
            sr=sr,
            S=S,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            dtype=dtype,
            pad_mode=pad_mode,
        )

    if reassign_times:
        times, S = __reassign_times(
            y=y,
            sr=sr,
            S=S,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            dtype=dtype,
            pad_mode=pad_mode,
        )

    # at least one of the branches above ran, so S has been populated
    assert S is not None

    mags: np.ndarray = np.abs(S)

    # clean up reassignment issues: divide-by-zero, bins with near-zero power,
    # and estimates outside the spectrogram bounds

    # retrieve bin frequencies and frame times to replace missing estimates
    if fill_nan or not reassign_frequencies or not reassign_times:
        if center:
            pad_length = None
        else:
            pad_length = n_fft

        bin_freqs = convert.fft_frequencies(sr=sr, n_fft=n_fft)

        frame_times = convert.frames_to_time(
            frames=np.arange(S.shape[-1]),
            sr=sr,
            hop_length=hop_length,
            n_fft=pad_length,
        )

    # find bins below the power threshold
    # reassigned bins with zero power will already be NaN
    if callable(ref_power):
        ref_p = ref_power(mags**2)
    else:
        ref_p = ref_power

    # A ufunc called with ``where=`` but no ``out=`` leaves the masked-out
    # entries uninitialized; pre-fill the output with False so that bins whose
    # magnitude is NaN are deterministically reported as "not low".
    mags_low = np.less(
        mags,
        ref_p**0.5,
        where=~np.isnan(mags),
        out=np.zeros(mags.shape, dtype=bool),
    )

    # for reassigned estimates, optionally set thresholded bins to NaN, return
    # bin frequencies and frame times in place of NaN generated by
    # divide-by-zero and power threshold, and clip to spectrogram bounds
    if reassign_frequencies:
        if ref_p > 0:
            freqs[mags_low] = np.nan

        if fill_nan:
            freqs = np.where(np.isnan(freqs), bin_freqs[:, np.newaxis], freqs)

        if clip:
            np.clip(freqs, 0, sr / 2.0, out=freqs)

    # or if reassignment was not requested, return bin frequencies and frame
    # times for every cell in the spectrogram
    else:
        freqs = np.broadcast_to(bin_freqs[:, np.newaxis], S.shape)

    if reassign_times:
        if ref_p > 0:
            times[mags_low] = np.nan

        if fill_nan:
            times = np.where(np.isnan(times), frame_times[np.newaxis, :], times)

        if clip:
            np.clip(times, 0, y.shape[-1] / float(sr), out=times)

    else:
        times = np.broadcast_to(frame_times[np.newaxis, :], S.shape)

    return freqs, times, mags
def magphase(D: np.ndarray, *, power: float = 1) -> Tuple[np.ndarray, np.ndarray]:
    """Split a complex-valued spectrogram D into magnitude (S) and phase (P)
    factors such that ``D = S * P``.

    Parameters
    ----------
    D : np.ndarray [shape=(..., d, t), dtype=complex]
        complex-valued spectrogram
    power : float > 0
        Exponent applied to the magnitude spectrogram,
        e.g., 1 for magnitude, 2 for power, etc.

    Returns
    -------
    D_mag : np.ndarray [shape=(..., d, t), dtype=real]
        magnitude of ``D``, raised to ``power``
    D_phase : np.ndarray [shape=(..., d, t), dtype=complex]
        ``exp(1.j * phi)`` where ``phi`` is the phase of ``D``.
        Entries where ``D`` is exactly zero are assigned phase ``1 + 0j``.

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> D = librosa.stft(y)
    >>> magnitude, phase = librosa.magphase(D)
    >>> magnitude
    array([[5.395e-03, 3.332e-03, ..., 9.862e-07, 1.201e-05],
           [3.244e-03, 2.690e-03, ..., 9.536e-07, 1.201e-05],
           ...,
           [7.523e-05, 3.722e-05, ..., 1.188e-04, 1.031e-03],
           [7.640e-05, 3.944e-05, ..., 5.180e-04, 1.346e-03]],
          dtype=float32)
    >>> phase
    array([[ 1.   +0.000e+00j,  1.   +0.000e+00j, ...,
            -1.   -8.742e-08j, -1.   -8.742e-08j],
           [-1.   -8.742e-08j, -0.775-6.317e-01j, ...,
            -0.885-4.648e-01j,  0.472-8.815e-01j],
           ...,
           [ 1.   -4.342e-12j,  0.028-9.996e-01j, ...,
            -0.222-9.751e-01j, -0.75 -6.610e-01j],
           [-1.   -8.742e-08j, -1.   -8.742e-08j, ...,
             1.   +0.000e+00j,  1.   +0.000e+00j]], dtype=complex64)

    Or get the phase angle (in radians)

    >>> np.angle(phase)
    array([[ 0.000e+00,  0.000e+00, ..., -3.142e+00, -3.142e+00],
           [-3.142e+00, -2.458e+00, ..., -2.658e+00, -1.079e+00],
           ...,
           [-4.342e-12, -1.543e+00, ..., -1.794e+00, -2.419e+00],
           [-3.142e+00, -3.142e+00, ...,  0.000e+00,  0.000e+00]],
          dtype=float32)
    """
    magnitude = np.abs(D)

    # Mask of exact zeros: adding it to the magnitude yields a divisor of 1
    # there, which avoids NaNs and lets us force the phase to 1+0j.
    is_zero = magnitude == 0
    divisor = magnitude + is_zero

    # The real and imaginary parts are normalized separately, because a
    # complex division can produce NaNs when denormalized numbers are
    # involved (< ~2e-39 for complex64, ~5e-309 for complex128).
    unit = np.empty_like(D, dtype=util.dtype_r2c(D.dtype))
    unit.real = D.real / divisor + is_zero
    unit.imag = D.imag / divisor

    # Exponentiate in place so that the magnitude buffer is reused
    magnitude **= power

    return magnitude, unit
def phase_vocoder(
    D: np.ndarray,
    *,
    rate: float,
    hop_length: Optional[int] = None,
    n_fft: Optional[int] = None,
) -> np.ndarray:
    """Phase vocoder.  Given an STFT matrix D, speed up by a factor of ``rate``

    Based on the implementation provided by [#]_.

    This is a simplified implementation, intended primarily for
    reference and pedagogical purposes.  It makes no attempt to
    handle transients, and is likely to produce many audible
    artifacts.  For a higher quality implementation, we recommend
    the RubberBand library [#]_ and its Python wrapper `pyrubberband`.

    .. [#] Ellis, D. P. W. "A phase vocoder in Matlab."
        Columbia University, 2002.
        http://www.ee.columbia.edu/~dpwe/resources/matlab/pvoc/

    .. [#] https://breakfastquay.com/rubberband/

    Examples
    --------
    >>> # Play at double speed
    >>> y, sr   = librosa.load(librosa.ex('trumpet'))
    >>> D       = librosa.stft(y, n_fft=2048, hop_length=512)
    >>> D_fast  = librosa.phase_vocoder(D, rate=2.0, hop_length=512)
    >>> y_fast  = librosa.istft(D_fast, hop_length=512)

    >>> # Or play at 1/3 speed
    >>> y, sr   = librosa.load(librosa.ex('trumpet'))
    >>> D       = librosa.stft(y, n_fft=2048, hop_length=512)
    >>> D_slow  = librosa.phase_vocoder(D, rate=1./3, hop_length=512)
    >>> y_slow  = librosa.istft(D_slow, hop_length=512)

    Parameters
    ----------
    D : np.ndarray [shape=(..., d, t), dtype=complex]
        STFT matrix
    rate : float > 0 [scalar]
        Speed-up factor: ``rate > 1`` is faster, ``rate < 1`` is slower.
    hop_length : int > 0 [scalar] or None
        The number of samples between successive columns of ``D``.
        If None, defaults to ``n_fft//4 = (D.shape[-2]-1)//2``
    n_fft : int > 0 or None
        The number of samples per frame in D.
        By default (None), this will be inferred from the shape of D.
        However, if D was constructed using an odd-length window, the correct
        frame length can be specified here.

    Returns
    -------
    D_stretched : np.ndarray [shape=(..., d, t / rate), dtype=complex]
        time-stretched STFT

    Raises
    ------
    ParameterError
        If ``rate`` is not strictly positive.

    See Also
    --------
    pyrubberband
    """
    # A non-positive rate has no meaning here: zero makes the time grid
    # undefined and a negative value would silently yield an empty output.
    if rate <= 0:
        raise ParameterError(f"rate={rate} must be strictly positive")

    if n_fft is None:
        n_fft = 2 * (D.shape[-2] - 1)

    if hop_length is None:
        hop_length = int(n_fft // 4)

    # Fractional positions (in input frames) of each output frame
    time_steps = np.arange(0, D.shape[-1], rate, dtype=np.float64)

    # Create an empty output array
    shape = list(D.shape)
    shape[-1] = len(time_steps)
    d_stretch = np.zeros_like(D, shape=shape)

    # Expected phase advance in each bin
    phi_advance = np.linspace(0, np.pi * hop_length, D.shape[-2])

    # Phase accumulator; initialize to the first sample
    phase_acc = np.angle(D[..., 0])

    # Pad 0 columns to simplify boundary logic
    padding = [(0, 0) for _ in D.shape]
    padding[-1] = (0, 2)
    D = np.pad(D, padding, mode="constant")

    for t, step in enumerate(time_steps):
        # The two input frames that bracket the fractional position
        columns = D[..., int(step) : int(step + 2)]

        # Weighting for linear magnitude interpolation
        alpha = np.mod(step, 1.0)
        mag = (1.0 - alpha) * np.abs(columns[..., 0]) + alpha * np.abs(columns[..., 1])

        # Store to output array
        d_stretch[..., t] = util.phasor(phase_acc, mag=mag)

        # Compute phase advance
        dphase = np.angle(columns[..., 1]) - np.angle(columns[..., 0]) - phi_advance

        # Wrap to -pi:pi range
        dphase = dphase - 2.0 * np.pi * np.round(dphase / (2.0 * np.pi))

        # Accumulate phase
        phase_acc += phi_advance + dphase

    return d_stretch
@cache(level=20)
def iirt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    win_length: int = 2048,
    hop_length: Optional[int] = None,
    center: bool = True,
    tuning: float = 0.0,
    pad_mode: _PadMode = "constant",
    flayout: str = "sos",
    res_type: str = "soxr_hq",
    **kwargs: Any,
) -> np.ndarray:
    r"""Time-frequency representation using IIR filters

    This function will return a time-frequency representation
    using a multirate filter bank consisting of IIR filters. [#]_

    First, ``y`` is resampled as needed according to the provided ``sample_rates``.

    Then, a filterbank with ``n`` band-pass filters is designed.

    The resampled input signals are processed by the filterbank as a whole.
    (`scipy.signal.filtfilt` resp. `sosfiltfilt` is used to make the phase linear.)
    The output of the filterbank is cut into frames.
    For each band, the short-time mean-square power (STMSP) is calculated by
    summing the squares of ``win_length`` subsequent filtered time samples
    (with ``win_length`` rescaled to the sampling rate of that band).

    When called with the default set of parameters, it will generate the TF-representation
    (pitch filterbank):

        * 85 filters with MIDI pitches [24, 108] as ``center_freqs``.
        * each filter having a bandwidth of one semitone.

    .. [#] Müller, Meinard.
           "Information Retrieval for Music and Motion."
           Springer Verlag. 2007.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    win_length : int > 0
        Window length (in samples of ``y``).
    hop_length : int > 0 [scalar]
        Hop length, number samples between subsequent frames.
        If not supplied, defaults to ``win_length // 4``.
    center : boolean
        - If ``True``, the signal ``y`` is padded so that frame
          ``D[..., :, t]`` is centered at ``y[t * hop_length]``.
        - If ``False``, then ``D[..., :, t]`` begins at ``y[t * hop_length]``
    tuning : float [scalar]
        Tuning deviation from A440 in fractions of a bin.
    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, this function uses zero padding.
    flayout : string
        - If `sos` (default), a series of second-order filters is used for filtering with `scipy.signal.sosfiltfilt`.
          Minimizes numerical precision errors for high-order filters, but is slower.
        - If `ba`, the standard difference equation is used for filtering with `scipy.signal.filtfilt`.
          Can be unstable for high-order filters.
    res_type : string
        The resampling mode.  See `librosa.resample` for details.
    **kwargs : additional keyword arguments
        Additional arguments for `librosa.filters.semitone_filterbank`
        (e.g., could be used to provide another set of ``center_freqs`` and ``sample_rates``).

    Returns
    -------
    bands_power : np.ndarray [shape=(..., n, t)]
        Short-time mean-square power for the input signal.
        The array is allocated with the same dtype as ``y``.

    Raises
    ------
    ParameterError
        If ``flayout`` is neither `ba` nor `sos`.

    See Also
    --------
    librosa.filters.semitone_filterbank
    librosa.filters.mr_frequencies
    librosa.cqt
    scipy.signal.filtfilt
    scipy.signal.sosfiltfilt

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('trumpet'), duration=3)
    >>> D = np.abs(librosa.iirt(y))
    >>> C = np.abs(librosa.cqt(y=y, sr=sr))
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
    ...                                y_axis='cqt_hz', x_axis='time', ax=ax[0])
    >>> ax[0].set(title='Constant-Q transform')
    >>> ax[0].label_outer()
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(D, ref=np.max),
    ...                                y_axis='cqt_hz', x_axis='time', ax=ax[1])
    >>> ax[1].set_title('Semitone spectrogram (iirt)')
    >>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
    """
    if flayout not in ("ba", "sos"):
        raise ParameterError(f"Unsupported flayout={flayout}")

    # check audio input
    util.valid_audio(y, mono=False)

    # Fall back to a quarter-window hop when none is given
    if hop_length is None:
        hop_length = win_length // 4

    # Centering: pad half a window on both ends of the time axis only
    if center:
        half_window = win_length // 2
        pad_widths = [(0, 0) for _ in range(y.ndim)]
        pad_widths[-1] = (half_window, half_window)
        y = np.pad(y, pad_widths, mode=pad_mode)

    # get the semitone filterbank
    filterbank_ct, sample_rates = semitone_filterbank(
        tuning=tuning, flayout=flayout, **kwargs
    )

    # one resampled copy of the signal per distinct filter sampling rate
    y_srs = np.unique(sample_rates)
    y_resampled = [
        resample(y, orig_sr=sr, target_sr=cur_sr, res_type=res_type)
        for cur_sr in y_srs
    ]

    # Compute the number of frames that will fit. The end may get truncated.
    n_frames = int(1 + (y.shape[-1] - win_length) // hop_length)

    # Output layout: leading (channel) axes of y, then filter index, then frame
    out_shape = y.shape[:-1] + (len(filterbank_ct), n_frames)
    bands_power = np.empty_like(y, shape=out_shape)

    for band, (cur_sr, cur_filter) in enumerate(zip(sample_rates, filterbank_ct)):
        # pick the resampled signal that matches this filter's sampling rate
        band_input = y_resampled[np.flatnonzero(y_srs == cur_sr)[0]]

        # zero-phase filtering; flayout was validated above, so anything
        # other than "ba" is "sos"
        if flayout == "ba":
            filtered = scipy.signal.filtfilt(
                cur_filter[0], cur_filter[1], band_input, axis=-1
            )
        else:
            filtered = scipy.signal.sosfiltfilt(cur_filter, band_input, axis=-1)

        # rescale the hop and window to the sampling rate of this band
        factor = sr / cur_sr
        hop_stmsp = hop_length / factor
        win_stmsp = int(round(win_length / factor))

        # The hop is kept as a floating-point number here.
        # The discretization happens at the end to avoid accumulated rounding errors.
        starts = np.arange(0, filtered.shape[-1] - win_stmsp, hop_stmsp)

        # If the filtered signal is too short for n_frames windows, extend it
        # and recompute the frame starts
        if len(starts) < n_frames:
            min_length = int(np.ceil(n_frames * hop_stmsp)) + win_stmsp
            filtered = util.fix_length(filtered, size=min_length)
            starts = np.arange(0, filtered.shape[-1] - win_stmsp, hop_stmsp)

        starts = np.round(starts).astype(int)[:n_frames]

        # index matrix: one row of sample positions per frame
        idx = np.add.outer(starts, np.arange(win_stmsp))

        bands_power[..., band, :] = factor * np.sum(filtered[..., idx] ** 2, axis=-1)

    return bands_power
@cache(level=30)
def power_to_db(
    S: np.ndarray,
    *,
    ref: Union[float, Callable] = 1.0,
    amin: float = 1e-10,
    top_db: Optional[float] = 80.0,
) -> np.ndarray:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units

    This computes the scaling ``10 * log10(S / ref)`` in a numerically
    stable way.

    Parameters
    ----------
    S : np.ndarray
        input power
    ref : scalar or callable
        If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``::

            10 * log10(S / ref)

        Zeros in the output correspond to positions where ``S == ref``.

        If callable, the reference value is computed as ``ref(S)``.
    amin : float > 0 [scalar]
        minimum threshold for ``abs(S)`` and ``ref``
    top_db : float >= 0 [scalar]
        threshold the output at ``top_db`` below the peak:
        ``max(10 * log10(S/ref)) - top_db``.
        If ``None``, no thresholding is applied.

    Returns
    -------
    S_db : np.ndarray
        ``S_db ~= 10 * log10(S) - 10 * log10(ref)``

    Raises
    ------
    ParameterError
        If ``amin <= 0`` or ``top_db < 0``.

    See Also
    --------
    perceptual_weighting
    db_to_power
    amplitude_to_db
    db_to_amplitude

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    Get a power spectrogram from a waveform ``y``

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> S = np.abs(librosa.stft(y))
    >>> librosa.power_to_db(S**2)
    array([[-41.809, -41.809, ..., -41.809, -41.809],
           [-41.809, -41.809, ..., -41.809, -41.809],
           ...,
           [-41.809, -41.809, ..., -41.809, -41.809],
           [-41.809, -41.809, ..., -41.809, -41.809]], dtype=float32)

    Compute dB relative to peak power

    >>> librosa.power_to_db(S**2, ref=np.max)
    array([[-80., -80., ..., -80., -80.],
           [-80., -80., ..., -80., -80.],
           ...,
           [-80., -80., ..., -80., -80.],
           [-80., -80., ..., -80., -80.]], dtype=float32)

    Or compare to median power

    >>> librosa.power_to_db(S**2, ref=np.median)
    array([[16.578, 16.578, ..., 16.578, 16.578],
           [16.578, 16.578, ..., 16.578, 16.578],
           ...,
           [16.578, 16.578, ..., 16.578, 16.578],
           [16.578, 16.578, ..., 16.578, 16.578]], dtype=float32)

    And plot the results

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> imgpow = librosa.display.specshow(S**2, sr=sr, y_axis='log', x_axis='time',
    ...                                   ax=ax[0])
    >>> ax[0].set(title='Power spectrogram')
    >>> ax[0].label_outer()
    >>> imgdb = librosa.display.specshow(librosa.power_to_db(S**2, ref=np.max),
    ...                                  sr=sr, y_axis='log', x_axis='time', ax=ax[1])
    >>> ax[1].set(title='Log-Power spectrogram')
    >>> fig.colorbar(imgpow, ax=ax[0])
    >>> fig.colorbar(imgdb, ax=ax[1], format="%+2.0f dB")
    """
    S = np.asarray(S)

    if amin <= 0:
        raise ParameterError("amin must be strictly positive")

    # Real-valued input is used as-is; complex input is reduced to its
    # modulus after warning the caller that the phase is dropped.
    magnitude = S
    if np.issubdtype(S.dtype, np.complexfloating):
        warnings.warn(
            "power_to_db was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call power_to_db(np.abs(D)**2) instead.",
            stacklevel=2,
        )
        magnitude = np.abs(S)

    # The reference is either computed from the data or taken as given
    ref_value = ref(magnitude) if callable(ref) else np.abs(ref)

    # Both terms are floored at amin so that the logarithm stays finite
    log_spec: np.ndarray = 10.0 * np.log10(np.maximum(amin, magnitude))
    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))

    if top_db is None:
        return log_spec

    if top_db < 0:
        raise ParameterError("top_db must be non-negative")

    # Limit the dynamic range to top_db below the peak
    return np.maximum(log_spec, log_spec.max() - top_db)
@cache(level=30)
def db_to_power(S_db: np.ndarray, *, ref: float = 1.0) -> np.ndarray:
    """Convert a dB-scale spectrogram to a power spectrogram.

    This effectively inverts ``power_to_db``::

        db_to_power(S_db) ~= ref * 10.0**(S_db / 10)

    Parameters
    ----------
    S_db : np.ndarray
        dB-scaled spectrogram
    ref : number > 0
        Reference power: output will be scaled by this value

    Returns
    -------
    S : np.ndarray
        Power spectrogram

    Notes
    -----
    This function caches at level 30.
    """
    # Undo the 10 * log10(.) scaling, then restore the reference level
    linear_ratio = np.power(10.0, 0.1 * S_db)
    return ref * linear_ratio
@cache(level=30)
def amplitude_to_db(
    S: np.ndarray,
    *,
    ref: Union[float, Callable] = 1.0,
    amin: float = 1e-5,
    top_db: Optional[float] = 80.0,
) -> np.ndarray:
    """Convert an amplitude spectrogram to dB-scaled spectrogram.

    This is equivalent to ``power_to_db(S**2, ref=ref**2, amin=amin**2, top_db=top_db)``,
    but is provided for convenience.

    Parameters
    ----------
    S : np.ndarray
        input amplitude. Scalar (0-dimensional) input is also accepted.
    ref : scalar or callable
        If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``:
        ``20 * log10(S / ref)``.
        Zeros in the output correspond to positions where ``S == ref``.

        If callable, the reference value is computed as ``ref(S)``.
    amin : float > 0 [scalar]
        minimum threshold for ``S`` and ``ref``
    top_db : float >= 0 [scalar]
        threshold the output at ``top_db`` below the peak:
        ``max(20 * log10(S/ref)) - top_db``

    Returns
    -------
    S_db : np.ndarray
        ``S`` measured in dB

    See Also
    --------
    power_to_db, db_to_amplitude

    Notes
    -----
    This function caches at level 30.
    """
    S = np.asarray(S)

    if np.issubdtype(S.dtype, np.complexfloating):
        warnings.warn(
            "amplitude_to_db was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call amplitude_to_db(np.abs(S)) instead.",
            stacklevel=2,
        )

    magnitude = np.abs(S)

    if callable(ref):
        # User supplied a function to calculate reference power
        ref_value = ref(magnitude)
    else:
        ref_value = np.abs(ref)

    # Square in place to avoid a second full-size allocation. For 0-d input,
    # np.abs hands back a NumPy scalar rather than an ndarray, and a scalar
    # cannot be used as a ufunc ``out`` target, so fall back to a fresh result.
    out_array = magnitude if isinstance(magnitude, np.ndarray) else None
    power = np.square(magnitude, out=out_array)

    return power_to_db(power, ref=ref_value**2, amin=amin**2, top_db=top_db)
@cache(level=30)
def db_to_amplitude(S_db: np.ndarray, *, ref: float = 1.0) -> np.ndarray:
    """Convert a dB-scaled spectrogram to an amplitude spectrogram.

    This effectively inverts `amplitude_to_db`::

        db_to_amplitude(S_db) ~= 10.0**(0.5 * S_db/10 + log10(ref))

    Parameters
    ----------
    S_db : np.ndarray
        dB-scaled spectrogram
    ref : number > 0
        Optional reference amplitude: it is squared and passed to
        `db_to_power` as the reference power.

    Returns
    -------
    S : np.ndarray
        Linear magnitude spectrogram

    Notes
    -----
    This function caches at level 30.
    """
    # Go through the power domain, then take the square root
    power = db_to_power(S_db, ref=ref**2)
    return power**0.5
@cache(level=30)
def perceptual_weighting(
    S: np.ndarray, frequencies: np.ndarray, *, kind: str = "A", **kwargs: Any
) -> np.ndarray:
    """Perceptual weighting of a power spectrogram::

        S_p[..., f, :] = frequency_weighting(f, 'A') + 10*log(S[..., f, :] / ref)

    Parameters
    ----------
    S : np.ndarray [shape=(..., d, t)]
        Power spectrogram
    frequencies : np.ndarray [shape=(d,)]
        Center frequency for each row of ``S``
    kind : str
        The frequency weighting curve to use.
        e.g. `'A'`, `'B'`, `'C'`, `'D'`, `None or 'Z'`
    **kwargs : additional keyword arguments
        Additional keyword arguments to `power_to_db`.

    Returns
    -------
    S_p : np.ndarray [shape=(..., d, t)]
        perceptually weighted version of ``S``

    See Also
    --------
    power_to_db

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    Re-weight a CQT power spectrum, using peak power as reference

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('A1')))
    >>> freqs = librosa.cqt_frequencies(C.shape[0],
    ...                                 fmin=librosa.note_to_hz('A1'))
    >>> perceptual_CQT = librosa.perceptual_weighting(C**2,
    ...                                               freqs,
    ...                                               ref=np.max)
    >>> perceptual_CQT
    array([[ -96.528,  -97.101, ..., -108.561, -108.561],
           [ -95.88 ,  -96.479, ..., -107.551, -107.551],
           ...,
           [ -65.142,  -53.256, ...,  -80.098,  -80.098],
           [ -71.542,  -53.197, ...,  -80.311,  -80.311]])

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(C,
    ...                                                        ref=np.max),
    ...                                fmin=librosa.note_to_hz('A1'),
    ...                                y_axis='cqt_hz', x_axis='time',
    ...                                ax=ax[0])
    >>> ax[0].set(title='Log CQT power')
    >>> ax[0].label_outer()
    >>> imgp = librosa.display.specshow(perceptual_CQT, y_axis='cqt_hz',
    ...                                 fmin=librosa.note_to_hz('A1'),
    ...                                 x_axis='time', ax=ax[1])
    >>> ax[1].set(title='Perceptually weighted log CQT')
    >>> fig.colorbar(img, ax=ax[0], format="%+2.0f dB")
    >>> fig.colorbar(imgp, ax=ax[1], format="%+2.0f dB")
    """
    # Per-frequency weights, shaped as a column so they broadcast over frames
    weights = convert.frequency_weighting(frequencies, kind=kind)
    column_offset = weights.reshape((-1, 1))

    weighted: np.ndarray = column_offset + power_to_db(S, **kwargs)
    return weighted
@cache(level=30)
def fmt(
    y: np.ndarray,
    *,
    t_min: float = 0.5,
    n_fmt: Optional[int] = None,
    kind: str = "cubic",
    beta: float = 0.5,
    over_sample: float = 1,
    axis: int = -1,
) -> np.ndarray:
    """The fast Mellin transform (FMT)

    The Mellin of a signal `y` is performed by interpolating `y` on an exponential time
    axis, applying a polynomial window, and then taking the discrete Fourier transform.

    When the Mellin parameter (beta) is 1/2, it is also known as the scale transform. [#]_
    The scale transform can be useful for audio analysis because its magnitude is invariant
    to scaling of the domain (e.g., time stretching or compression).  This is analogous
    to the magnitude of the Fourier transform being invariant to shifts in the input domain.

    .. [#] De Sena, Antonio, and Davide Rocchesso.
        "A fast Mellin and scale transform."
        EURASIP Journal on Applied Signal Processing 2007.1 (2007): 75-75.

    .. [#] Cohen, L.
        "The scale representation."
        IEEE Transactions on Signal Processing 41, no. 12 (1993): 3275-3292.

    Parameters
    ----------
    y : np.ndarray, real-valued
        The input signal(s).  Can be multidimensional.
        The target axis must contain at least 3 samples.
    t_min : float > 0
        The minimum time spacing (in samples).
        This value should generally be less than 1 to preserve as much information as
        possible.
    n_fmt : int > 2 or None
        The number of scale transform bins to use.
        If None, then
        ``n_fmt = ceil(over_sample * log((n-1)/t_min) / log((n-1)/(n-2)))``
        is taken, where ``n = y.shape[axis]``
    kind : str
        The type of interpolation to use when re-sampling the input.
        See `scipy.interpolate.interp1d` for possible values.

        Note that the default is to use high-precision (cubic) interpolation.
        This can be slow in practice; if speed is preferred over accuracy,
        then consider using ``kind='linear'``.
    beta : float
        The Mellin parameter.  ``beta=0.5`` provides the scale transform.
    over_sample : float >= 1
        Over-sampling factor for exponential resampling.
    axis : int
        The axis along which to transform ``y``

    Returns
    -------
    x_scale : np.ndarray [dtype=complex]
        The scale transform of ``y`` along the ``axis`` dimension.

    Raises
    ------
    ParameterError
        if ``n_fmt < 3`` or ``t_min <= 0``
        or if ``n_fmt`` is None and ``over_sample < 1``
        or if ``y`` is not finite
        or if ``y.shape[axis] < 3``
        or if the resampling grid contains repeated positions.

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    >>> # Generate a signal and time-stretch it (with energy normalization)
    >>> scale = 1.25
    >>> freq = 3.0
    >>> x1 = np.linspace(0, 1, num=1024, endpoint=False)
    >>> x2 = np.linspace(0, 1, num=int(scale * len(x1)), endpoint=False)
    >>> y1 = np.sin(2 * np.pi * freq * x1)
    >>> y2 = np.sin(2 * np.pi * freq * x2) / np.sqrt(scale)
    >>> # Verify that the two signals have the same energy
    >>> np.sum(np.abs(y1)**2), np.sum(np.abs(y2)**2)
        (255.99999999999997, 255.99999999999969)
    >>> scale1 = librosa.fmt(y1, n_fmt=512)
    >>> scale2 = librosa.fmt(y2, n_fmt=512)

    >>> # And plot the results
    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=2)
    >>> ax[0].plot(y1, label='Original')
    >>> ax[0].plot(y2, linestyle='--', label='Stretched')
    >>> ax[0].set(xlabel='time (samples)', title='Input signals')
    >>> ax[0].legend()
    >>> ax[1].semilogy(np.abs(scale1), label='Original')
    >>> ax[1].semilogy(np.abs(scale2), linestyle='--', label='Stretched')
    >>> ax[1].set(xlabel='scale coefficients', title='Scale transform magnitude')
    >>> ax[1].legend()

    >>> # Plot the scale transform of an onset strength autocorrelation
    >>> y, sr = librosa.load(librosa.ex('choice'))
    >>> odf = librosa.onset.onset_strength(y=y, sr=sr)
    >>> # Auto-correlate with up to 10 seconds lag
    >>> odf_ac = librosa.autocorrelate(odf, max_size=10 * sr // 512)
    >>> # Normalize
    >>> odf_ac = librosa.util.normalize(odf_ac, norm=np.inf)
    >>> # Compute the scale transform
    >>> odf_ac_scale = librosa.fmt(librosa.util.normalize(odf_ac), n_fmt=512)
    >>> # Plot the results
    >>> fig, ax = plt.subplots(nrows=3)
    >>> ax[0].plot(odf, label='Onset strength')
    >>> ax[0].set(xlabel='Time (frames)', title='Onset strength')
    >>> ax[1].plot(odf_ac, label='Onset autocorrelation')
    >>> ax[1].set(xlabel='Lag (frames)', title='Onset autocorrelation')
    >>> ax[2].semilogy(np.abs(odf_ac_scale), label='Scale transform magnitude')
    >>> ax[2].set(xlabel='scale coefficients')
    """
    n = y.shape[axis]

    if n < 3:
        raise ParameterError(f"y.shape[{axis}]=={n} < 3")

    if t_min <= 0:
        raise ParameterError(f"t_min={t_min} must be a positive number")

    if n_fmt is None:
        if over_sample < 1:
            raise ParameterError(f"over_sample={over_sample} must be >= 1")

        # The base is the maximum ratio between adjacent samples
        # Since the sample spacing is increasing, this is simply the
        # ratio between the positions of the last two samples: (n-1)/(n-2)
        log_base = np.log(n - 1) - np.log(n - 2)

        n_fmt = int(np.ceil(over_sample * (np.log(n - 1) - np.log(t_min)) / log_base))

    elif n_fmt < 3:
        raise ParameterError(f"n_fmt=={n_fmt} < 3")
    else:
        log_base = (np.log(n_fmt - 1) - np.log(n_fmt - 2)) / over_sample

    if not np.all(np.isfinite(y)):
        raise ParameterError("y must be finite everywhere")

    base = np.exp(log_base)

    # original grid: signal covers [0, 1).  This range is arbitrary, but convenient.
    # The final sample is positioned at (n-1)/n, so we omit the endpoint
    x = np.linspace(0, 1, num=n, endpoint=False)

    # build the interpolator
    f_interp = scipy.interpolate.interp1d(x, y, kind=kind, axis=axis)

    # build the new sampling grid
    # exponentially spaced between t_min/n and 1 (exclusive)
    # we'll go one past where we need, and drop the last sample
    # When over-sampling, the last input sample contributes n_over samples.
    # To keep the spacing consistent, we over-sample by n_over, and then
    # trim the final samples.
    n_over = int(np.ceil(over_sample))
    x_exp = np.logspace(
        (np.log(t_min) - np.log(n)) / log_base,
        0,
        num=n_fmt + n_over,
        endpoint=False,
        base=base,
    )[:-n_over]

    # Clean up any rounding errors at the boundaries of the interpolation.
    # The interpolator gets angry if we try to extrapolate, so the grid is
    # always clamped to [t_min / n, (n - 1) / n]; clipping is a no-op for
    # points that already lie inside that range.
    x_exp = np.clip(x_exp, float(t_min) / n, x[-1])

    # Make sure that all sample points are unique
    # This should never happen!
    if len(np.unique(x_exp)) != len(x_exp):
        raise ParameterError("Redundant sample positions in Mellin transform")

    # Resample the signal
    y_res = f_interp(x_exp)

    # Broadcast the window correctly
    shape = [1] * y_res.ndim
    shape[axis] = -1

    # Apply the window and fft
    # Normalization is absorbed into the window here for expedience
    fft = get_fftlib()
    result: np.ndarray = fft.rfft(
        y_res * ((x_exp**beta).reshape(shape) * np.sqrt(n) / n_fmt), axis=axis
    )
    return result
@overload
def pcen(
    S: np.ndarray,
    *,
    sr: float = ...,
    hop_length: int = ...,
    gain: float = ...,
    bias: float = ...,
    power: float = ...,
    time_constant: float = ...,
    eps: float = ...,
    b: Optional[float] = ...,
    max_size: int = ...,
    ref: Optional[np.ndarray] = ...,
    axis: int = ...,
    max_axis: Optional[int] = ...,
    zi: Optional[np.ndarray] = ...,
    return_zf: Literal[False] = ...,
) -> np.ndarray:
    # Typing-only stub: when ``return_zf`` is omitted or passed as the literal
    # ``False``, the declared return type is a single array.
    ...


@overload
def pcen(
    S: np.ndarray,
    *,
    sr: float = ...,
    hop_length: int = ...,
    gain: float = ...,
    bias: float = ...,
    power: float = ...,
    time_constant: float = ...,
    eps: float = ...,
    b: Optional[float] = ...,
    max_size: int = ...,
    ref: Optional[np.ndarray] = ...,
    axis: int = ...,
    max_axis: Optional[int] = ...,
    zi: Optional[np.ndarray] = ...,
    return_zf: Literal[True],
) -> Tuple[np.ndarray, np.ndarray]:
    # Typing-only stub: ``return_zf`` has no default here, so this overload only
    # matches an explicit literal ``True``; the declared return type is a pair
    # of arrays.
    ...


@overload
def pcen(
    S: np.ndarray,
    *,
    sr: float = ...,
    hop_length: int = ...,
    gain: float = ...,
    bias: float = ...,
    power: float = ...,
    time_constant: float = ...,
    eps: float = ...,
    b: Optional[float] = ...,
    max_size: int = ...,
    ref: Optional[np.ndarray] = ...,
    axis: int = ...,
    max_axis: Optional[int] = ...,
    zi: Optional[np.ndarray] = ...,
    return_zf: bool = ...,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    # Typing-only stub: fallback for a ``return_zf`` that is only known to be a
    # ``bool``; the declared return type is the union of the two cases above.
    ...
@cache(level=30)
def pcen(
    S: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    gain: float = 0.98,
    bias: float = 2,
    power: float = 0.5,
    time_constant: float = 0.400,
    eps: float = 1e-6,
    b: Optional[float] = None,
    max_size: int = 1,
    ref: Optional[np.ndarray] = None,
    axis: int = -1,
    max_axis: Optional[int] = None,
    zi: Optional[np.ndarray] = None,
    return_zf: bool = False,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """Per-channel energy normalization (PCEN)

    This function normalizes a time-frequency representation ``S`` by
    performing automatic gain control, followed by nonlinear compression [#]_ ::

        P[f, t] = (S / (eps + M[f, t])**gain + bias)**power - bias**power

    IMPORTANT: the default values of eps, gain, bias, and power match the
    original publication, in which ``S`` is a 40-band mel-frequency
    spectrogram with 25 ms windowing, 10 ms frame shift, and raw audio values
    in the interval [-2**31; 2**31-1[. If you use these default values, we
    recommend to make sure that the raw audio is properly scaled to this
    interval, and not to [-1, 1[ as is most often the case.

    The matrix ``M`` is the result of applying a low-pass, temporal IIR filter
    to ``S``::

        M[f, t] = (1 - b) * M[f, t - 1] + b * S[f, t]

    If ``b`` is not provided, it is calculated as::

        b = (sqrt(1 + 4* T**2) - 1) / (2 * T**2)

    where ``T = time_constant * sr / hop_length``. [#]_

    This normalization is designed to suppress background noise and
    emphasize foreground signals, and can be used as an alternative to
    decibel scaling (`amplitude_to_db`).

    This implementation also supports smoothing across frequency bins
    by specifying ``max_size > 1``. If this option is used, the filtered
    spectrogram ``M`` is computed as::

        M[f, t] = (1 - b) * M[f, t - 1] + b * R[f, t]

    where ``R`` has been max-filtered along the frequency axis, similar to
    the SuperFlux algorithm implemented in `onset.onset_strength`::

        R[f, t] = max(S[f - max_size//2: f + max_size//2, t])

    This can be used to perform automatic gain control on signals that cross
    or span multiple frequency bands, which may be desirable for spectrograms
    with high frequency resolution.

    .. [#] Wang, Y., Getreuer, P., Hughes, T., Lyon, R. F., & Saurous, R. A.
       (2017, March). Trainable frontend for robust and far-field keyword spotting.
       In Acoustics, Speech and Signal Processing (ICASSP), 2017
       IEEE International Conference on (pp. 5670-5674). IEEE.

    .. [#] Lostanlen, V., Salamon, J., McFee, B., Cartwright, M., Farnsworth, A.,
       Kelling, S., and Bello, J. P. Per-Channel Energy Normalization: Why and How.
       IEEE Signal Processing Letters, 26(1), 39-43.

    Parameters
    ----------
    S : np.ndarray (non-negative)
        The input (magnitude) spectrogram
    sr : number > 0 [scalar]
        The audio sampling rate.
        Only used (and validated) when ``b`` is not provided.
    hop_length : int > 0 [scalar]
        The hop length of ``S``, expressed in samples.
        Only used (and validated) when ``b`` is not provided.
    gain : number >= 0 [scalar]
        The gain factor.  Typical values should be slightly less than 1.
    bias : number >= 0 [scalar]
        The bias point of the nonlinear compression (default: 2)
    power : number >= 0 [scalar]
        The compression exponent.  Typical values should be between 0 and 0.5.
        Smaller values of ``power`` result in stronger compression.
        At the limit ``power=0``, polynomial compression becomes logarithmic.
    time_constant : number > 0 [scalar]
        The time constant for IIR filtering, measured in seconds.
    eps : number > 0 [scalar]
        A small constant used to ensure numerical stability of the filter.
    b : number in [0, 1]  [scalar]
        The filter coefficient for the low-pass filter.
        If not provided, it will be inferred from ``time_constant``.
    max_size : int > 0 [scalar]
        The width of the max filter applied to the frequency axis.
        If left as `1`, no filtering is performed.
    ref : None or np.ndarray (shape=S.shape)
        An optional pre-computed reference spectrum (``R`` in the above).
        If not provided it will be computed from ``S``.
    axis : int [scalar]
        The (time) axis of the input spectrogram.
    max_axis : None or int [scalar]
        The frequency axis of the input spectrogram.
        If `None`, and ``S`` is two-dimensional, it will be inferred
        as the opposite from ``axis``.
        If ``S`` is not two-dimensional, and ``max_size > 1``, an error
        will be raised.
    zi : np.ndarray
        The initial filter delay values.

        This may be the ``zf`` (final delay values) of a previous call to ``pcen``, or
        computed by `scipy.signal.lfilter_zi`.
    return_zf : bool
        If ``True``, return the final filter delay values along with the PCEN output ``P``.
        This is primarily useful in streaming contexts, where the final state of one
        block of processing should be used to initialize the next block.

        If ``False`` (default) only the PCEN values ``P`` are returned.

    Returns
    -------
    P : np.ndarray, non-negative [shape=(n, m)]
        The per-channel energy normalized version of ``S``.
    zf : np.ndarray (optional)
        The final filter delay values.  Only returned if ``return_zf=True``.

    Raises
    ------
    ParameterError
        If ``power``, ``gain`` or ``bias`` is negative;
        if ``eps`` or ``time_constant`` is not strictly positive;
        if ``max_size`` is not a positive integer;
        if ``b`` is not provided and ``sr`` or ``hop_length`` is not strictly
        positive;
        if ``b`` (given or derived) lies outside ``[0, 1]``;
        if max-filtering is requested (``max_size > 1`` and ``ref`` not given)
        on 1-dimensional input, or on input with more than two dimensions
        without ``max_axis``.

    See Also
    --------
    amplitude_to_db
    librosa.onset.onset_strength

    Examples
    --------
    Compare PCEN to log amplitude (dB) scaling on Mel spectra

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('robin'))

    >>> # We recommend scaling y to the range [-2**31, 2**31[ before applying
    >>> # PCEN's default parameters. Furthermore, we use power=1 to get a
    >>> # magnitude spectrum instead of a power spectrum.
    >>> S = librosa.feature.melspectrogram(y=y, sr=sr, power=1)
    >>> log_S = librosa.amplitude_to_db(S, ref=np.max)
    >>> pcen_S = librosa.pcen(S * (2**31))
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(log_S, x_axis='time', y_axis='mel', ax=ax[0])
    >>> ax[0].set(title='log amplitude (dB)', xlabel=None)
    >>> ax[0].label_outer()
    >>> imgpcen = librosa.display.specshow(pcen_S, x_axis='time', y_axis='mel', ax=ax[1])
    >>> ax[1].set(title='Per-channel energy normalization')
    >>> fig.colorbar(img, ax=ax[0], format="%+2.0f dB")
    >>> fig.colorbar(imgpcen, ax=ax[1])

    Compare PCEN with and without max-filtering

    >>> pcen_max = librosa.pcen(S * (2**31), max_size=3)
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> librosa.display.specshow(pcen_S, x_axis='time', y_axis='mel', ax=ax[0])
    >>> ax[0].set(title='Per-channel energy normalization (no max-filter)')
    >>> ax[0].label_outer()
    >>> img = librosa.display.specshow(pcen_max, x_axis='time', y_axis='mel', ax=ax[1])
    >>> ax[1].set(title='Per-channel energy normalization (max_size=3)')
    >>> fig.colorbar(img, ax=ax)
    """
    if power < 0:
        raise ParameterError(f"power={power} must be nonnegative")

    if gain < 0:
        raise ParameterError(f"gain={gain} must be non-negative")

    if bias < 0:
        raise ParameterError(f"bias={bias} must be non-negative")

    if eps <= 0:
        raise ParameterError(f"eps={eps} must be strictly positive")

    if time_constant <= 0:
        raise ParameterError(f"time_constant={time_constant} must be strictly positive")

    if not util.is_positive_int(max_size):
        raise ParameterError(f"max_size={max_size} must be a positive integer")

    if b is None:
        # ``sr`` and ``hop_length`` are only consulted on this path, so they
        # are only validated here.  Without these checks, hop_length=0 leaks a
        # ZeroDivisionError and sr=0 produces a NaN coefficient.
        if sr <= 0:
            raise ParameterError(f"sr={sr} must be strictly positive")

        if hop_length <= 0:
            raise ParameterError(f"hop_length={hop_length} must be strictly positive")

        # Time constant expressed in frames
        t_frames = time_constant * sr / float(hop_length)
        # By default, b is the positive root of the quadratic
        #   t_frames**2 * b**2 + b - 1 = 0
        # which approximates the full-width half-max of the
        # squared frequency response of the IIR low-pass filter
        b = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2)

    # The chained comparison is False for NaN, so NaN coefficients are rejected too
    if not 0 <= b <= 1:
        raise ParameterError(f"b={b} must be between 0 and 1")

    if np.issubdtype(S.dtype, np.complexfloating):
        warnings.warn(
            "pcen was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call pcen(np.abs(D)) instead.",
            stacklevel=2,
        )
        S = np.abs(S)

    if ref is None:
        if max_size == 1:
            # No max-filtering requested: smooth the input directly
            ref = S
        elif S.ndim == 1:
            raise ParameterError(
                "Max-filtering cannot be applied to 1-dimensional input"
            )
        else:
            if max_axis is None:
                if S.ndim != 2:
                    raise ParameterError(
                        f"Max-filtering a {S.ndim:d}-dimensional spectrogram "
                        "requires you to specify max_axis"
                    )
                # if axis = 0, max_axis=1
                # if axis = +- 1, max_axis = 0
                max_axis = np.mod(1 - axis, 2)

            ref = scipy.ndimage.maximum_filter1d(S, max_size, axis=max_axis)

    if zi is None:
        # Make sure zi matches dimension to input:
        # one singleton axis per dimension of ref, filled with the single
        # delay value computed by lfilter_zi for this first-order filter
        shape = tuple([1] * ref.ndim)
        zi = np.empty(shape)
        zi[:] = scipy.signal.lfilter_zi([b], [1, b - 1])[:]

    # Temporal integration
    S_smooth: np.ndarray
    zf: np.ndarray
    S_smooth, zf = scipy.signal.lfilter([b], [1, b - 1], ref, zi=zi, axis=axis)

    # Adaptive gain control
    # Working in log-space gives us some stability, and a slight speedup
    # smooth == (eps + S_smooth) ** (-gain)
    smooth = np.exp(-gain * (np.log(eps) + np.log1p(S_smooth / eps)))

    # Dynamic range compression
    S_out: np.ndarray
    if power == 0:
        # Logarithmic compression; bias is not used on this branch
        S_out = np.log1p(S * smooth)
    elif bias == 0:
        # Pure power-law compression: (S * smooth) ** power
        S_out = np.exp(power * (np.log(S) + np.log(smooth)))
    else:
        # (S * smooth + bias) ** power - bias ** power, factored through
        # log1p/expm1
        S_out = (bias**power) * np.expm1(power * np.log1p(S * smooth / bias))

    if return_zf:
        return S_out, zf
    else:
        return S_out
def griffinlim(
    S: np.ndarray,
    *,
    n_iter: int = 32,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    n_fft: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    length: Optional[int] = None,
    pad_mode: _PadModeSTFT = "constant",
    momentum: float = 0.99,
    init: Optional[str] = "random",
    random_state: Optional[
        Union[int, np.random.RandomState, np.random.Generator]
    ] = None,
) -> np.ndarray:
    """Approximate magnitude spectrogram inversion using the "fast" Griffin-Lim algorithm.

    Given a short-time Fourier transform magnitude matrix (``S``), the algorithm randomly
    initializes phase estimates, and then alternates forward- and inverse-STFT
    operations. [#]_

    Note that this assumes reconstruction of a real-valued time-domain signal, and
    that ``S`` contains only the non-negative frequencies (as computed by
    `stft`).

    The "fast" GL method [#]_ uses a momentum parameter to accelerate convergence.

    .. [#] D. W. Griffin and J. S. Lim,
        "Signal estimation from modified short-time Fourier transform,"
        IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.

    .. [#] Perraudin, N., Balazs, P., & Søndergaard, P. L.
        "A fast Griffin-Lim algorithm,"
        IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4),
        Oct. 2013.

    Parameters
    ----------
    S : np.ndarray [shape=(..., n_fft // 2 + 1, t), non-negative]
        An array of short-time Fourier transform magnitudes as produced by
        `stft`.
    n_iter : int > 0
        The number of iterations to run
    hop_length : None or int > 0
        The hop length of the STFT.  If not provided, it will default to ``n_fft // 4``
    win_length : None or int > 0
        The window length of the STFT.  By default, it will equal ``n_fft``
    n_fft : None or int > 0
        The number of samples per frame.
        By default, this will be inferred from the shape of ``S`` as an even number.
        However, if an odd frame length was used, you can explicitly set ``n_fft``.
    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        A window specification as supported by `stft` or `istft`
    center : boolean
        If ``True``, the STFT is assumed to use centered frames.
        If ``False``, the STFT is assumed to use left-aligned frames.
    dtype : np.dtype
        Real numeric type for the time-domain signal.  Default is inferred
        to match the precision of the input spectrogram.
    length : None or int > 0
        If provided, the output ``y`` is zero-padded or clipped to exactly ``length``
        samples.
    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.
    momentum : number >= 0
        The momentum parameter for fast Griffin-Lim.
        Setting this to 0 recovers the original Griffin-Lim method [1]_.
        Values near 1 can lead to faster convergence, but above 1 may not converge.
    init : None or 'random' [default]
        If 'random' (the default), then phase values are initialized randomly
        according to ``random_state``.  This is recommended when the input ``S`` is
        a magnitude spectrogram with no initial phase estimates.

        If `None`, then the phase is initialized from ``S``.  This is useful when
        an initial guess for phase can be provided, or when you want to resume
        Griffin-Lim from a previous output.
    random_state : None, int, np.random.RandomState, or np.random.Generator
        If int (Python or NumPy integer), random_state is the seed used by the
        random number generator for phase initialization.

        If `np.random.RandomState` or `np.random.Generator` instance, the random number
        generator itself.

        If `None`, defaults to the `np.random.default_rng()` object.

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        time-domain signal reconstructed from ``S``

    Raises
    ------
    ParameterError
        If ``random_state`` is of an unsupported type,
        if ``momentum < 0``,
        or if ``init`` is neither `None` nor ``'random'``.

    See Also
    --------
    stft
    istft
    magphase
    filters.get_window

    Examples
    --------
    A basic STFT inverse example

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> # Get the magnitude spectrogram
    >>> S = np.abs(librosa.stft(y))
    >>> # Invert using Griffin-Lim
    >>> y_inv = librosa.griffinlim(S)
    >>> # Invert without estimating phase
    >>> y_istft = librosa.istft(S)

    Wave-plot the results

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
    >>> librosa.display.waveshow(y, sr=sr, color='b', ax=ax[0])
    >>> ax[0].set(title='Original', xlabel=None)
    >>> ax[0].label_outer()
    >>> librosa.display.waveshow(y_inv, sr=sr, color='g', ax=ax[1])
    >>> ax[1].set(title='Griffin-Lim reconstruction', xlabel=None)
    >>> ax[1].label_outer()
    >>> librosa.display.waveshow(y_istft, sr=sr, color='r', ax=ax[2])
    >>> ax[2].set_title('Magnitude-only istft reconstruction')
    """
    if random_state is None:
        rng = np.random.default_rng()
    elif isinstance(random_state, (int, np.integer)):
        # NumPy integer scalars (e.g. np.int64) are valid seeds as well,
        # so accept them alongside built-in ints
        rng = np.random.RandomState(seed=random_state)  # type: ignore
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state  # type: ignore
    else:
        raise ParameterError(f"Unsupported random_state={random_state!r}")

    if momentum > 1:
        warnings.warn(
            f"Griffin-Lim with momentum={momentum} > 1 can be unstable. "
            "Proceed with caution!",
            stacklevel=2,
        )
    elif momentum < 0:
        raise ParameterError(f"griffinlim() called with momentum={momentum} < 0")

    # Infer n_fft from the spectrogram shape
    if n_fft is None:
        n_fft = 2 * (S.shape[-2] - 1)

    # Infer the dtype from S
    angles = np.empty(S.shape, dtype=util.dtype_r2c(S.dtype))
    # Small constant added to the magnitudes below to avoid dividing by zero
    eps = util.tiny(angles)

    if init == "random":
        # randomly initialize the phase
        angles[:] = util.phasor((2 * np.pi * rng.random(size=S.shape)))
    elif init is None:
        # Initialize an all ones complex matrix
        angles[:] = 1.0
    else:
        raise ParameterError(f"init={init} must either be None or 'random'")

    # Place-holders for temporary data and reconstructed buffer
    rebuilt = None
    tprev = None
    inverse = None

    # Absorb magnitudes into angles
    angles *= S

    for _ in range(n_iter):
        # Invert with our current estimate of the phases
        inverse = istft(
            angles,
            hop_length=hop_length,
            win_length=win_length,
            n_fft=n_fft,
            window=window,
            center=center,
            dtype=dtype,
            length=length,
            out=inverse,
        )

        # Rebuild the spectrogram
        rebuilt = stft(
            inverse,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            out=rebuilt,
        )

        # Update our phase estimates
        angles[:] = rebuilt
        if tprev is not None:
            # Momentum term: subtract a fraction of the previous rebuilt spectrogram
            angles -= (momentum / (1 + momentum)) * tprev
        # Normalize to (approximately) unit modulus, then re-apply the magnitudes
        angles /= np.abs(angles) + eps
        angles *= S

        # Store: swap the two buffers so the next stft call writes into the
        # older one while the current result is kept as ``tprev``
        rebuilt, tprev = tprev, rebuilt

    # Return the final phase estimates
    return istft(
        angles,
        hop_length=hop_length,
        win_length=win_length,
        n_fft=n_fft,
        window=window,
        center=center,
        dtype=dtype,
        length=length,
        out=inverse,
    )
def _spectrogram(
    *,
    y: Optional[np.ndarray] = None,
    S: Optional[np.ndarray] = None,
    n_fft: Optional[int] = 2048,
    hop_length: Optional[int] = 512,
    power: float = 1,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    pad_mode: _PadModeSTFT = "constant",
) -> Tuple[np.ndarray, int]:
    """Return a magnitude spectrogram together with its frame length.

    Feature extractors that accept either a time series or a pre-computed
    spectrogram use this helper to normalize both kinds of input.

    Parameters
    ----------
    y : None or np.ndarray
        Audio time series; required when ``S`` is not given.
    S : None or np.ndarray
        Pre-computed spectrogram.  When given, it is returned unchanged and
        ``y`` is ignored.
    n_fft : int > 0
        STFT frame length.
    hop_length : int > 0
        STFT hop length.
    power : float > 0
        Exponent applied to the STFT magnitude,
        e.g., 1 for magnitude, 2 for power.
    win_length : int <= n_fft [scalar]
        Length of the analysis window, forwarded to `stft`.
        If unspecified, `stft` uses its own default.
    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        Window specification, forwarded to `stft`.

        .. see also:: `filters.get_window`
    center : boolean
        Whether frames are centered; forwarded to `stft`.
    pad_mode : string
        Edge padding mode used when ``center=True``; forwarded to `stft`.

    Returns
    -------
    S_out : np.ndarray [dtype=np.float]
        - ``S`` itself when it was provided
        - otherwise ``|stft(y, ...)|**power``
    n_fft : int > 0
        - When ``S`` was provided: the input ``n_fft`` if it agrees with the
          second-to-last dimension of ``S``, else ``2 * (S.shape[-2] - 1)``
        - Otherwise the input ``n_fft``

    Raises
    ------
    ParameterError
        If ``S`` is not provided and either ``n_fft`` or ``y`` is `None`.
    """
    # Pre-computed spectrogram: only n_fft may need to be reconciled
    if S is not None:
        mismatched = n_fft is None or n_fft // 2 + 1 != S.shape[-2]
        if mismatched:
            # Fall back to the even frame length implied by the bin count
            n_fft = 2 * (S.shape[-2] - 1)
        return S, n_fft

    # From here on the spectrogram has to be computed from audio
    if n_fft is None:
        raise ParameterError(f"Unable to compute spectrogram with n_fft={n_fft}")

    if y is None:
        raise ParameterError(
            "Input signal must be provided to compute a spectrogram"
        )

    transform = stft(
        y,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        center=center,
        window=window,
        pad_mode=pad_mode,
    )
    magnitude = np.abs(transform)

    return magnitude**power, n_fft