Spaces:

atsushieee
/

sovits-test

Running

App Files Files Community

sovits-test / pitch /core /salience.py

atsushieee

Upload folder using huggingface_hub

9791162 3 months ago

raw

history blame

14 kB

	"""
	| Description: libf0 salience-based F0 estimation implementation
	| Author: Sebastian Rosenzweig, Simon Schwär, Meinard Müller
	| License: The MIT license, https://opensource.org/licenses/MIT
	| This file is part of libf0.
	"""
	import numpy as np
	from librosa import stft
	from scipy import ndimage, linalg
	from numba import njit


	def salience(x, Fs=22050, N=2048, H=256, F_min=55.0, F_max=1760.0, R=10.0, num_harm=10, freq_smooth_len=11,
	             alpha=0.9, gamma=0.0, constraint_region=None, tol=5, score_low=0.01, score_high=1.0):
	    """
	    Implementation of a salience-based F0-estimation algorithm using pitch contours, inspired by Melodia.

	    .. [#] Justin Salamon and Emilia Gómez,
	       "Melody Extraction From Polyphonic Music Signals Using Pitch Contour Characteristics."
	       IEEE Transactions on Audio, Speech, and Language Processing, vol. 20, no. 6, pp. 1759–1770, Aug. 2012.

	    Parameters
	    ----------
	    x : ndarray
	        Audio signal
	    Fs : int
	        Sampling rate (Default value = 22050)
	    N : int
	        Window size (Default value = 2048)
	    H : int
	        Hop size (Default value = 256)
	    F_min : float or int
	        Minimal frequency (Default value = 55.0)
	    F_max : float or int
	        Maximal frequency (Default value = 1760.0)
	    R : float or int
	        Frequency resolution given in cents (Default value = 10.0)
	    num_harm : int
	        Number of harmonics (Default value = 10)
	    freq_smooth_len : int
	        Filter length for vertical smoothing (Default value = 11)
	    alpha : float
	        Weighting parameter for harmonics (Default value = 0.9)
	    gamma : float
	        Logarithmic compression factor (Default value = 0.0)
	    constraint_region : None or ndarray
	        Constraint regions, row-format: (t_start_sec, t_end_sec, f_start_hz, f_end_hz)
	        (Default value = None)
	    tol : int
	        Tolerance parameter for transition matrix (Default value = 5)
	    score_low : float
	        Score (low) for transition matrix (Default value = 0.01)
	    score_high : float
	        Score (high) for transition matrix (Default value = 1.0)

	    Returns
	    -------
	    f0 : ndarray
	        Estimated F0-trajectory, frames without an estimate (trajectory index -1) are set to 0
	    T_coef : ndarray
	        Time axis, computed as frame index * H / Fs
	    sal : ndarray
	        Salience value of estimated F0, normalized per frame by the frame maximum;
	        0 for frames without an estimate and for frames whose salience is zero everywhere

	    See also
	    --------
	    [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb
	    """

	    # compute salience representation via instantaneous frequency and harmonic summation
	    Z, F_coef_hertz = compute_salience_rep(x, Fs, N=N, H=H, F_min=F_min, F_max=F_max, R=R,
	                                           num_harm=num_harm, freq_smooth_len=freq_smooth_len,
	                                           alpha=alpha, gamma=gamma)

	    # compute trajectory via dynamic programming
	    T_coef = (np.arange(Z.shape[1]) * H) / Fs
	    index_CR = compute_trajectory_cr(Z, T_coef, F_coef_hertz, constraint_region,
	                                     tol=tol, score_low=score_low, score_high=score_high)

	    # fancy indexing returns a copy, so F_coef_hertz itself is not modified below;
	    # index -1 temporarily selects the last bin and is overwritten with 0 right away
	    traj = F_coef_hertz[index_CR]
	    traj[index_CR == -1] = 0

	    # compute salience value: normalize every frame by its maximum.
	    # Frames whose maximum is 0 would produce 0/0 = NaN; they are left at 0 instead.
	    Z_max = np.max(Z, axis=0)
	    Z_norm = np.divide(Z, Z_max, out=np.zeros(Z.shape), where=Z_max > 0)
	    sal = Z_norm[index_CR, np.arange(Z.shape[1])]
	    sal[traj == 0] = 0

	    return traj, T_coef, sal


	def compute_salience_rep(x, Fs, N, H, F_min, F_max, R, num_harm, freq_smooth_len, alpha, gamma):
	    """
	    Compute salience representation [FMP, Eq. (8.56)]

	    The signal is transformed with an STFT, mapped onto a log-frequency axis via
	    instantaneous frequencies, smoothed along the frequency axis and finally turned
	    into a salience representation by harmonic summation.

	    Parameters
	    ----------
	    x : ndarray
	        Audio signal
	    Fs : int
	        Sampling rate
	    N : int
	        Window size
	    H : int
	        Hop size
	    F_min : float or int
	        Minimal frequency
	    F_max : float or int
	        Maximal frequency
	    R : float or int
	        Frequency resolution given in cents
	    num_harm : int
	        Number of harmonics
	    freq_smooth_len : int
	        Filter length for vertical smoothing
	    alpha : float
	        Weighting parameter for harmonics
	    gamma : float
	        Logarithmic compression factor

	    Returns
	    -------
	    Z : ndarray
	        Salience representation
	    F_coef_hertz : ndarray
	        Frequency axis in Hz

	    See also
	    --------
	    [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb
	    """

	    # complex spectrogram of the input signal
	    spectrogram = stft(x, n_fft=N, hop_length=H, win_length=N, pad_mode='constant')

	    # binned log-frequency spectrogram based on instantaneous frequency
	    logfreq_spec, F_coef_hertz = compute_y_lf_if_bin_eff(spectrogram, Fs, N, H, F_min, F_max, R)

	    # smoothing along the frequency axis (axis 0) with a Hann kernel
	    smoothing_kernel = np.hanning(freq_smooth_len)
	    logfreq_spec = ndimage.convolve1d(logfreq_spec, smoothing_kernel, axis=0, mode='constant')

	    # harmonic summation
	    Z = compute_salience_from_logfreq_spec(logfreq_spec, R, n_harmonics=num_harm,
	                                           alpha=alpha, beta=1, gamma=gamma)
	    return Z, F_coef_hertz


	def compute_y_lf_if_bin_eff(X, Fs, N, H, F_min, F_max, R):
	    """
	    Binned Log-frequency Spectrogram with variable frequency resolution based on instantaneous frequency,
	    more efficient implementation than FMP

	    Parameters
	    ----------
	    X : ndarray
	        Complex spectrogram
	    Fs : int
	        Sampling rate in Hz
	    N : int
	        Window size
	    H : int
	        Hop size
	    F_min : float or int
	        Minimal frequency
	    F_max : float or int
	        Maximal frequency
	    R : float or int
	        Frequency resolution given in cents

	    Returns
	    -------
	    Y_LF_IF_bin : ndarray
	        Binned log-frequency spectrogram using instantaneous frequency (shape: [freq, time])
	    F_coef_hertz : ndarray
	        Frequency axis in Hz
	    """

	    two_pi = 2 * np.pi

	    # number of bins on the log-frequency axis and their center frequencies
	    num_bins = frequency_to_bin_index(F_max, R, F_min) + 1
	    F_coef_hertz = F_min * np.power(2, (np.arange(0, num_bins) * R / 1200))

	    # heterodyned phase increment (hpi): frame-to-frame phase difference minus the
	    # phase advance expected from each bin's center frequency
	    bin_idx = np.arange(X.shape[0]).reshape(-1, 1)
	    omega = two_pi * bin_idx / N  # center frequency for each bin in rad
	    phase = np.angle(X)
	    hpi = (phase[:, 1:] - phase[:, :-1]) - omega * H

	    # reduce hpi to the -pi:pi range
	    # (rounding-based variant of np.mod(hpi + np.pi, 2 * np.pi) - np.pi)
	    hpi = hpi - two_pi * (np.around((hpi / two_pi) + 1) - 1)

	    # instantaneous frequencies in Hz; the first frame is duplicated so that the
	    # result has as many time frames as X
	    inst_f = (omega + hpi / H) * Fs / two_pi
	    inst_f = np.concatenate((inst_f[:, :1], inst_f), axis=1)

	    # discard frequencies outside [F_min, F_max): zero them, then turn zeros into NaN
	    in_range = np.logical_and(inst_f >= F_min, inst_f < F_max)
	    inst_f *= in_range
	    inst_f[inst_f == 0] = np.nan

	    # assign every spectrogram entry to a log-frequency bin; discarded entries go to
	    # an extra bin with index num_bins that is cut off before returning
	    bin_assignment = frequency_to_bin_index(inst_f, R, F_min)
	    bin_assignment[np.isnan(inst_f)] = num_bins

	    # accumulate the power spectrogram into the bins, one time frame at a time
	    power = np.abs(X) ** 2
	    num_frames = power.shape[1]
	    binned = np.zeros((num_bins + 1, num_frames))
	    for frame in range(num_frames):
	        np.add.at(binned[:, frame], bin_assignment[:, frame], power[:, frame])

	    return binned[:num_bins, :], F_coef_hertz


	def compute_salience_from_logfreq_spec(lf_spec, R, n_harmonics, alpha, beta, gamma, harmonic_win_len=11):
	    """
	    Compute salience representation using harmonic summation following [1]

	    [1] J. Salamon and E. Gomez,
	    "Melody Extraction From Polyphonic Music Signals Using Pitch Contour Characteristics."
	    IEEE Transactions on Audio, Speech, and Language Processing, vol. 20, no. 6, pp. 1759–1770, Aug. 2012.

	    Parameters
	    ----------
	    lf_spec : ndarray
	        (F, T) log-spectrogram
	    R : float or int
	        Frequency resolution given in cents
	    n_harmonics : int
	        Number of harmonics
	    alpha : float
	        Weighting parameter for harmonics (harmonic number idx, counted from 0, is weighted by alpha**idx)
	    beta : float
	        Compression parameter for spectrogram magnitudes (applied as an exponent)
	    gamma : float
	        Magnitude threshold in dB relative to the global maximum; entries whose
	        relative level is not below gamma are set to zero
	    harmonic_win_len : int
	        Length of a frequency weighting window in bins (Default value = 11).
	        The window actually used has 2 * (harmonic_win_len // 2) + 1 samples.

	    Returns
	    -------
	    Z : ndarray
	        (F, T) salience representation of the input spectrogram
	    """

	    # magnitude thresholding and compression
	    eps = np.finfo(np.float32).eps
	    threshold_mask = (20 * np.log10(lf_spec / np.max(lf_spec) + eps)) < gamma
	    lf_spec = lf_spec ** beta * threshold_mask

	    # compute window
	    max_diff_bins = harmonic_win_len // 2
	    window = np.cos(np.linspace(-1, 1, 2 * max_diff_bins + 1) * np.pi / 2) ** 2  # cosine^2 window
	    # use the real window length so that an even harmonic_win_len does not cause a shape mismatch
	    win_len = len(window)

	    # compute indices of harmonics (offset of each harmonic in bins above the fundamental)
	    harmonics = np.round(np.log2(np.arange(1, n_harmonics + 1)) * 1200 / R).astype(int)
	    weighting_vec = np.zeros((lf_spec.shape[0] + max_diff_bins))

	    # compute weights: one window per harmonic, centered at index h + max_diff_bins
	    for idx, h in enumerate(harmonics):
	        if h + win_len > len(weighting_vec):
	            break  # we reached the maximum length available
	        weighting_vec[h:h + win_len] += window * alpha ** idx

	    # correlate lf_spec with the weighting vector on the frequency axis.
	    # The origin is chosen such that weighting_vec[max_diff_bins] (center of the
	    # fundamental's window) lines up with the output bin. Note the parentheses:
	    # "-len(w) // 2" would evaluate as "(-len(w)) // 2" and shift the result by one
	    # bin whenever len(w) is odd.
	    origin = max_diff_bins - (len(weighting_vec) // 2)
	    Z = ndimage.correlate1d(lf_spec, weighting_vec, axis=0, mode='constant', cval=0, origin=origin)

	    # magnitude thresholding and compression
	    threshold_mask = (20 * np.log10(Z / np.max(Z) + eps)) < gamma
	    Z = Z ** beta * threshold_mask

	    return Z


	def define_transition_matrix(B, tol=0, score_low=0.01, score_high=1.0):
	    """
	    Generate transition matrix for dynamic programming

	    The matrix is a symmetric Toeplitz matrix: transitions between bins that are at
	    most `tol` bins apart receive `score_high`, all other transitions `score_low`.

	    Parameters
	    ----------
	    B : int
	        Number of bins
	    tol : int
	        Tolerance parameter for transition matrix (Default value = 0).
	        Values of tol >= B - 1 yield a matrix filled entirely with score_high.
	    score_low : float
	        Score (low) for transition matrix (Default value = 0.01)
	    score_high : float
	        Score (high) for transition matrix (Default value = 1.0)

	    Returns
	    -------
	    T : ndarray
	        (B, B) Transition matrix

	    See also
	    --------
	    [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb
	    """

	    # first column of the Toeplitz matrix
	    col = np.full((B,), score_low, dtype=float)

	    # clamp the high-score band to the available number of bins, so that a tolerance
	    # wider than the matrix (e.g. for a narrow constraint region) does not raise
	    num_high = min(max(tol + 1, 0), B)
	    col[:num_high] = score_high

	    return linalg.toeplitz(col)


	@njit
	def compute_trajectory_dp(Z, T):
	    """
	    Trajectory tracking using dynamic programming

	    Scores are accumulated in the log domain: for every frame and bin the best
	    predecessor bin of the previous frame is selected, then the optimal path is
	    recovered by backtracking from the best bin of the last frame.

	    Parameters
	    ----------
	    Z : ndarray
	        Salience representation
	    T : ndarray
	        Transition matrix

	    Returns
	    -------
	    eta_DP : ndarray
	        Trajectory indices

	    See also
	    --------
	    [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb
	    """

	    num_bins, num_frames = Z.shape

	    # a small offset keeps the logarithm finite for zero entries
	    offset = np.finfo(np.float32).eps
	    log_salience = np.log(Z + offset)
	    log_transition = np.log(T + offset)

	    backpointer = np.zeros((num_bins, num_frames))
	    accumulated = np.zeros((num_bins, num_frames))
	    accumulated[:, 0] = log_salience[:, 0]

	    # forward pass
	    for frame in range(1, num_frames):
	        for cur_bin in range(num_bins):
	            candidates = log_transition[cur_bin, :] + accumulated[:, frame - 1]
	            accumulated[cur_bin, frame] = np.max(candidates) + log_salience[cur_bin, frame]
	            backpointer[cur_bin, frame - 1] = np.argmax(candidates)

	    # backtracking
	    path = np.zeros(num_frames)
	    path[num_frames - 1] = int(np.argmax(accumulated[:, num_frames - 1]))

	    for frame in range(num_frames - 2, -1, -1):
	        path[frame] = backpointer[int(path[frame + 1]), frame]

	    return path.astype(np.int64)


	def compute_trajectory_cr(Z, T_coef, F_coef_hertz, constraint_region=None,
	                          tol=5, score_low=0.01, score_high=1.0):
	    """
	    Trajectory tracking with constraint regions
	    Notebook: C8/C8S2_FundFreqTracking.ipynb

	    Without constraint regions the trajectory is tracked over the whole salience
	    representation. With constraint regions, tracking is done separately inside each
	    region and all frames outside of the regions are marked as unvoiced.

	    Parameters
	    ----------
	    Z : ndarray
	        Salience representation
	    T_coef : ndarray
	        Time axis
	    F_coef_hertz : ndarray
	        Frequency axis in Hz
	    constraint_region : ndarray or None
	        Constraint regions, row-format: (t_start_sec, t_end_sec, f_start_hz, f_end_hz)
	        (Default value = None)
	    tol : int
	        Tolerance parameter for transition matrix (Default value = 5)
	    score_low : float
	        Score (low) for transition matrix (Default value = 0.01)
	    score_high : float
	        Score (high) for transition matrix (Default value = 1.0)

	    Returns
	    -------
	    eta : ndarray
	        Trajectory indices, unvoiced frames are indicated with -1

	    See also
	    --------
	    [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb
	    """

	    # unconstrained case: track over the full representation
	    if constraint_region is None:
	        transition = define_transition_matrix(Z.shape[0], tol=tol, score_low=score_low, score_high=score_high)
	        return compute_trajectory_dp(Z, transition)

	    def nearest_index(axis, value):
	        # index of the axis entry closest to the given value
	        return np.argmin(np.abs(axis - value))

	    # initialize contour, unvoiced frames are indicated with -1
	    eta = np.full(len(T_coef), -1)

	    # do tracking within every constraint region
	    for row_idx in range(constraint_region.shape[0]):
	        region = constraint_region[row_idx]

	        # convert start/end values (sec, sec, Hz, Hz) to indices
	        t_lo = nearest_index(T_coef, region[0])
	        t_hi = nearest_index(T_coef, region[1])
	        f_lo = nearest_index(F_coef_hertz, region[2])
	        f_hi = nearest_index(F_coef_hertz, region[3])

	        # track in the salience excerpt (both end indices inclusive)
	        excerpt = Z[f_lo:f_hi + 1, t_lo:t_hi + 1]
	        transition = define_transition_matrix(excerpt.shape[0], tol=tol,
	                                              score_low=score_low, score_high=score_high)
	        local_path = compute_trajectory_dp(excerpt, transition)

	        # fill contour, shifting the local bin indices back to the global frequency axis
	        eta[t_lo:t_hi + 1] = f_lo + local_path

	    return eta


	def frequency_to_bin_index(F, R, F_ref):
	    """
	    Binning function with variable frequency resolution
	    Note: Indexing starts with 0 (opposed to [FMP, Eq. (8.49)])

	    Parameters
	    ----------
	    F : float or ndarray
	        Frequency in Hz
	    R : float
	        Frequency resolution in cents
	    F_ref : float
	        Reference frequency in Hz (mapped to bin index 0)

	    Returns
	    -------
	    bin_index : np.int64 or ndarray of np.int64
	        Index for bin (starting with index 0)

	    See also
	    --------
	    [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb
	    """
	    bins_per_octave = 1200 / R
	    octaves_above_ref = np.log2(F / F_ref)

	    # adding 0.5 before flooring rounds to the nearest bin
	    nearest_bin = np.floor(bins_per_octave * octaves_above_ref + 0.5)
	    return nearest_bin.astype(np.int64)