""" | |
Description: libf0 salience-based F0 estimation implementation
Author: Sebastian Rosenzweig, Simon Schwär, Meinard Müller
License: The MIT license
This file is part of libf0.
"""
import numpy as np
from librosa import stft
from numba import njit
from scipy import linalg, ndimage
def salience(x, Fs=22050, N=2048, H=256, F_min=55.0, F_max=1760.0, R=10.0, num_harm=10, freq_smooth_len=11,
             alpha=0.9, gamma=0.0, constraint_region=None, tol=5, score_low=0.01, score_high=1.0):
    """
    Implementation of a salience-based F0-estimation algorithm using pitch contours, inspired by Melodia.

    .. [#] Justin Salamon and Emilia Gómez,
       "Melody Extraction From Polyphonic Music Signals Using Pitch Contour Characteristics."
       IEEE Transactions on Audio, Speech, and Language Processing, vol. 20, no. 6, pp. 1759–1770, Aug. 2012.

    Parameters
    ----------
    x : ndarray
        Audio signal
    Fs : int
        Sampling rate
    N : int
        Window size
    H : int
        Hop size
    F_min : float or int
        Minimal frequency
    F_max : float or int
        Maximal frequency
    R : float or int
        Frequency resolution given in cents (Default value = 10.0)
    num_harm : int
        Number of harmonics (Default value = 10)
    freq_smooth_len : int
        Filter length for vertical smoothing (Default value = 11)
    alpha : float
        Weighting parameter for harmonics (Default value = 0.9)
    gamma : float
        Magnitude threshold, forwarded unchanged to the salience computation where it is compared
        against the level in dB relative to the maximum (Default value = 0.0)
    constraint_region : None or ndarray
        Constraint regions, row-format: (t_start_sec, t_end_sec, f_start_hz, f_end_hz)
        (Default value = None)
    tol : int
        Tolerance parameter for transition matrix (Default value = 5)
    score_low : float
        Score (low) for transition matrix (Default value = 0.01)
    score_high : float
        Score (high) for transition matrix (Default value = 1.0)

    Returns
    -------
    f0 : ndarray
        Estimated F0-trajectory in Hz; frames without an estimate (index -1) are set to 0
    T_coef : ndarray
        Time axis in seconds (frame index * H / Fs)
    sal : ndarray
        Salience value of the estimated F0, normalized per frame by the frame maximum;
        0 for frames without an estimate and for frames whose salience is entirely zero

    See also
    --------
    [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb
    """
    # compute salience representation via instantaneous frequency and harmonic summation
    Z, F_coef_hertz = compute_salience_rep(x, Fs, N=N, H=H, F_min=F_min, F_max=F_max, R=R,
                                           num_harm=num_harm, freq_smooth_len=freq_smooth_len,
                                           alpha=alpha, gamma=gamma)

    # compute trajectory via dynamic programming
    T_coef = (np.arange(Z.shape[1]) * H) / Fs
    index_CR = compute_trajectory_cr(Z, T_coef, F_coef_hertz, constraint_region,
                                     tol=tol, score_low=score_low, score_high=score_high)

    # index -1 (no estimate) temporarily picks the last bin; overwrite those frames with 0 Hz
    traj = F_coef_hertz[index_CR]
    traj[index_CR == -1] = 0

    # normalize every frame by its maximum; frames that are entirely zero (e.g. digital silence)
    # are left at 0 instead of producing 0/0 = NaN
    Z_max = np.max(Z, axis=0)
    Z_norm = np.divide(Z, Z_max, out=np.zeros_like(Z, dtype=float), where=Z_max > 0)

    # salience of the selected bin in every frame
    sal = Z_norm[index_CR, np.arange(Z.shape[1])]
    sal[traj == 0] = 0

    return traj, T_coef, sal
def compute_salience_rep(x, Fs, N, H, F_min, F_max, R, num_harm, freq_smooth_len, alpha, gamma):
    """
    Compute salience representation [FMP, Eq. (8.56)]

    The signal is transformed with an STFT, binned onto a log-frequency axis using the
    instantaneous frequency, smoothed along the frequency axis and finally passed to the
    harmonic summation.

    Parameters
    ----------
    x : ndarray
        Audio signal
    Fs : int
        Sampling rate
    N : int
        Window size (also used as FFT size)
    H : int
        Hop size
    F_min : float or int
        Minimal frequency
    F_max : float or int
        Maximal frequency
    R : float or int
        Frequency resolution given in cents
    num_harm : int
        Number of harmonics
    freq_smooth_len : int
        Filter length for vertical smoothing
    alpha : float
        Weighting parameter for harmonics
    gamma : float
        Magnitude threshold handed to the harmonic summation

    Returns
    -------
    Z : ndarray
        Salience representation
    F_coef_hertz : ndarray
        Frequency axis in Hz

    See also
    --------
    [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb
    """
    # complex spectrogram of the input signal
    spectrogram = stft(x, n_fft=N, hop_length=H, win_length=N, pad_mode='constant')

    # log-frequency binning based on the instantaneous frequency
    logfreq_spec, F_coef_hertz = compute_y_lf_if_bin_eff(spectrogram, Fs, N, H, F_min, F_max, R)

    # smoothing along the frequency axis with a Hann kernel
    smoothing_kernel = np.hanning(freq_smooth_len)
    logfreq_spec = ndimage.convolve1d(logfreq_spec, smoothing_kernel, axis=0, mode='constant')

    # harmonic summation (the compression exponent beta is fixed to 1 here)
    salience_rep = compute_salience_from_logfreq_spec(logfreq_spec, R, n_harmonics=num_harm,
                                                      alpha=alpha, beta=1, gamma=gamma)
    return salience_rep, F_coef_hertz
def compute_y_lf_if_bin_eff(X, Fs, N, H, F_min, F_max, R):
    """
    Binned Log-frequency Spectrogram with variable frequency resolution based on instantaneous frequency,
    more efficient implementation than FMP

    Parameters
    ----------
    X : ndarray
        Complex spectrogram (shape: [freq, time])
    Fs : int
        Sampling rate in Hz
    N : int
        Window size
    H : int
        Hop size
    F_min : float or int
        Minimal frequency
    F_max : float or int
        Maximal frequency
    R : float or int
        Frequency resolution given in cents

    Returns
    -------
    Y_LF_IF_bin : ndarray
        Binned log-frequency spectrogram using instantaneous frequency (shape: [freq, time])
    F_coef_hertz : ndarray
        Frequency axis in Hz
    """
    # calculate number of bins on log frequency axis
    B = frequency_to_bin_index(F_max, R, F_min) + 1

    # center frequencies of the final bins
    F_coef_hertz = F_min * np.power(2, (np.arange(0, B) * R / 1200))

    # calculate heterodyned phase increment (hpi)
    k = np.arange(X.shape[0]).reshape(-1, 1)
    omega = 2 * np.pi * k / N  # center frequency for each bin in rad
    hpi = (np.angle(X[:, 1:]) - np.angle(X[:, 0:-1])) - omega * H

    # reduce hpi to -pi:pi range
    # rounding is much faster than np.mod(hpi + np.pi, 2 * np.pi) - np.pi, but gives the same result
    hpi = hpi - 2 * np.pi * (np.around((hpi / (2 * np.pi)) + 1) - 1)

    # calculate instantaneous frequencies in Hz
    inst_f = (omega + hpi / H) * Fs / (2 * np.pi)

    # repeat the first time frame to match dimensions of X
    inst_f = np.hstack((np.copy(inst_f[:, 0]).reshape(-1, 1), inst_f))

    # only frequencies inside [F_min, F_max) are relevant (NaN compares False and is discarded too)
    relevant = np.logical_and(inst_f >= F_min, inst_f < F_max)

    # find which inst_f values belong to which bin; everything that is not relevant is mapped to
    # an extra bin (index B) that is removed before returning the binned spectrogram.
    # Only relevant values are converted, so no NaN is ever cast to an integer.
    bin_assignment = np.full(inst_f.shape, B, dtype=np.int64)
    bin_assignment[relevant] = frequency_to_bin_index(inst_f[relevant], R, F_min)

    # perform binning on power spectrogram for each time frame separately
    Y = np.abs(X) ** 2
    Y_LF_IF_bin = np.zeros((B + 1, Y.shape[1]))
    for t in range(Y.shape[1]):
        # unbuffered accumulation: several STFT bins may fall into the same log-frequency bin
        np.add.at(Y_LF_IF_bin[:, t], bin_assignment[:, t], Y[:, t])

    return Y_LF_IF_bin[:B, :], F_coef_hertz
def compute_salience_from_logfreq_spec(lf_spec, R, n_harmonics, alpha, beta, gamma, harmonic_win_len=11):
    """
    Compute salience representation using harmonic summation following [1]

    [1] J. Salamon and E. Gomez,
    "Melody Extraction From Polyphonic Music Signals Using Pitch Contour Characteristics."
    IEEE Transactions on Audio, Speech, and Language Processing, vol. 20, no. 6, pp. 1759–1770, Aug. 2012.

    Parameters
    ----------
    lf_spec : ndarray
        (F, T) log-spectrogram
    R : float or int
        Frequency resolution given in cents
    n_harmonics : int
        Number of harmonics
    alpha : float
        Weighting parameter for harmonics (the i-th harmonic, counted from 0, is weighted by alpha**i)
    beta : float
        Compression parameter for spectrogram magnitudes
    gamma : float
        Magnitude threshold, compared against 20*log10 of the magnitude relative to the maximum
    harmonic_win_len : int
        Length of a frequency weighting window in bins; the window actually used always has
        the odd length 2 * (harmonic_win_len // 2) + 1

    Returns
    -------
    Z : ndarray
        (F, T) salience representation of the input spectrogram
    """
    eps = np.finfo(np.float32).eps

    # magnitude thresholding and compression
    # NOTE(review): the mask keeps entries whose relative level is *below* gamma dB -- confirm
    # that this direction is intended before relying on gamma for noise suppression
    threshold_mask = (20 * np.log10(lf_spec / np.max(lf_spec) + eps)) < gamma
    lf_spec = lf_spec**beta * threshold_mask

    # compute window (cosine^2, centered on the harmonic position)
    max_diff_bins = harmonic_win_len // 2
    window = np.cos(np.linspace(-1, 1, 2 * max_diff_bins + 1) * np.pi / 2)**2
    # use the real window length for slicing, so an even harmonic_win_len does not break the
    # accumulation below
    win_len = len(window)

    # compute indices of harmonics (offset of every harmonic above the fundamental, in bins)
    harmonics = np.round(np.log2(np.arange(1, n_harmonics + 1)) * 1200 / R).astype(int)

    # compute weights; entry j of the vector belongs to the bin offset (j - max_diff_bins)
    weighting_vec = np.zeros(lf_spec.shape[0] + max_diff_bins)
    for idx, h in enumerate(harmonics):
        if h + win_len > len(weighting_vec):
            break  # we reached the maximum length available
        weighting_vec[h:h + win_len] += window * alpha**idx

    # correlate lf_spec with the weighting vector on the frequency axis.
    # The origin moves the filter anchor from the middle of the vector to index max_diff_bins
    # (the center of the fundamental's window). It is written as m - (L // 2) on purpose:
    # "-L // 2" would floor the negated length and shift the result by one bin for odd L.
    origin = max_diff_bins - len(weighting_vec) // 2
    Z = ndimage.correlate1d(lf_spec, weighting_vec, axis=0, mode='constant', cval=0, origin=origin)

    # magnitude thresholding and compression of the salience itself
    threshold_mask = (20 * np.log10(Z / np.max(Z) + eps)) < gamma
    Z = Z**beta * threshold_mask

    return Z
def define_transition_matrix(B, tol=0, score_low=0.01, score_high=1.0):
    """
    Generate transition matrix for dynamic programming

    The matrix is a symmetric Toeplitz matrix: transitions between bins that are at most
    `tol` bins apart get `score_high`, all other transitions get `score_low`.

    Parameters
    ----------
    B : int
        Number of bins
    tol : int
        Tolerance parameter for transition matrix (Default value = 0)
    score_low : float
        Score (low) for transition matrix (Default value = 0.01)
    score_high : float
        Score (high) for transition matrix (Default value = 1.0)

    Returns
    -------
    T : ndarray
        (B, B) Transition matrix

    See also
    --------
    [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb
    """
    col = np.full(B, score_low, dtype=float)

    # number of leading entries with the high score, limited to the available bins so that a
    # tolerance wider than the matrix (tol + 1 > B) yields an all-high matrix instead of an error
    n_high = min(max(tol + 1, 0), B)
    col[:n_high] = score_high

    return linalg.toeplitz(col)
def compute_trajectory_dp(Z, T):
    """
    Trajectory tracking using dynamic programming

    Finds the bin sequence that maximizes the accumulated log-salience plus the
    log-transition scores between consecutive frames.

    Parameters
    ----------
    Z : ndarray
        (B, N) Salience representation
    T : ndarray
        (B, B) Transition matrix

    Returns
    -------
    eta_DP : ndarray
        Trajectory indices (int64, one per frame; empty if Z has no frames)

    See also
    --------
    [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb
    """
    # NOTE: explicit loops and scalar indexing are kept deliberately -- the module imports njit,
    # and this form remains usable should a JIT decorator be applied to this function.
    B, N = Z.shape
    if N == 0:
        # nothing to track
        return np.zeros(0, dtype=np.int64)

    eps_machine = np.finfo(np.float32).eps
    Z_log = np.log(Z + eps_machine)
    T_log = np.log(T + eps_machine)

    D = np.zeros((B, N))                  # accumulated score of the best path ending in (b, n)
    E = np.zeros((B, N), dtype=np.int64)  # E[b, n-1]: best predecessor bin of (b, n)

    D[:, 0] = Z_log[:, 0]
    for n in range(1, N):
        for b in range(B):
            # evaluate the candidates once and reuse them for both the maximum and its position
            candidates = T_log[b, :] + D[:, n - 1]
            best = np.argmax(candidates)
            E[b, n - 1] = best
            D[b, n] = candidates[best] + Z_log[b, n]

    # backtracking
    eta_DP = np.zeros(N, dtype=np.int64)
    eta_DP[N - 1] = np.argmax(D[:, N - 1])
    for n in range(N - 2, -1, -1):
        eta_DP[n] = E[eta_DP[n + 1], n]

    return eta_DP
def compute_trajectory_cr(Z, T_coef, F_coef_hertz, constraint_region=None,
                          tol=5, score_low=0.01, score_high=1.0):
    """
    Trajectory tracking with constraint regions

    Notebook: C8/C8S2_FundFreqTracking.ipynb

    Parameters
    ----------
    Z : ndarray
        Salience representation
    T_coef : ndarray
        Time axis
    F_coef_hertz : ndarray
        Frequency axis in Hz
    constraint_region : ndarray or None
        Constraint regions, row-format: (t_start_sec, t_end_sec, f_start_hz, f_end_hz).
        A single region may also be given as one flat row.
        (Default value = None)
    tol : int
        Tolerance parameter for transition matrix (Default value = 5)
    score_low : float
        Score (low) for transition matrix (Default value = 0.01)
    score_high : float
        Score (high) for transition matrix (Default value = 1.0)

    Returns
    -------
    eta : ndarray
        Trajectory indices, unvoiced frames are indicated with -1

    See also
    --------
    [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb
    """
    # without constraint regions the whole salience representation is tracked at once
    if constraint_region is None:
        T = define_transition_matrix(Z.shape[0], tol=tol, score_low=score_low, score_high=score_high)
        return compute_trajectory_dp(Z, T)

    # initialize contour, unvoiced frames are indicated with -1
    eta = np.full(len(T_coef), -1, dtype=np.int64)

    regions = np.asarray(constraint_region, dtype=float)
    if regions.size == 0:
        return eta
    regions = np.atleast_2d(regions)  # accept a single region given as a flat row

    # do tracking within every constraint region
    for row_idx in range(regions.shape[0]):
        t_start = regions[row_idx, 0]  # sec
        t_end = regions[row_idx, 1]    # sec
        f_start = regions[row_idx, 2]  # Hz
        f_end = regions[row_idx, 3]    # Hz

        # convert start/end values to indices (nearest grid point)
        t_start_idx = np.argmin(np.abs(T_coef - t_start))
        t_end_idx = np.argmin(np.abs(T_coef - t_end))
        f_start_idx = np.argmin(np.abs(F_coef_hertz - f_start))
        f_end_idx = np.argmin(np.abs(F_coef_hertz - f_end))

        # track in salience part
        cur_Z = Z[f_start_idx:f_end_idx + 1, t_start_idx:t_end_idx + 1]
        if cur_Z.size == 0:
            # inverted region (end before start): nothing to track, frames stay unvoiced
            continue

        # a region narrower than the tolerance allows every transition inside it; limiting the
        # tolerance to the region height keeps the transition matrix well defined
        cur_tol = min(tol, cur_Z.shape[0] - 1)
        T = define_transition_matrix(cur_Z.shape[0], tol=cur_tol,
                                     score_low=score_low, score_high=score_high)
        cur_eta = compute_trajectory_dp(cur_Z, T)

        # fill contour (indices relative to the region are shifted back to the full axis)
        eta[t_start_idx:t_end_idx + 1] = f_start_idx + cur_eta

    return eta
def frequency_to_bin_index(F, R=10.0, F_ref=55.0):
    """
    Binning function with variable frequency resolution

    Note: Indexing starts with 0 (opposed to [FMP, Eq. (8.49)])

    Parameters
    ----------
    F : float or array-like
        Frequency in Hz
    R : float
        Frequency resolution in cents (Default value = 10.0)
    F_ref : float
        Reference frequency in Hz (Default value = 55.0)

    Returns
    -------
    bin_index : int or ndarray
        Index for bin (starting with index 0), as int64. F_ref itself maps to index 0;
        the result is only meaningful for positive, finite frequencies.

    See also
    --------
    [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb
    """
    # convert up front so that plain Python sequences are accepted as well
    F = np.asarray(F, dtype=float)

    # distance to the reference in units of R cents, rounded to the nearest bin
    bin_index = np.floor((1200 / R) * np.log2(F / F_ref) + 0.5).astype(np.int64)
    return bin_index