Text-to-Speech

Build error

App Files Files Community

Text-to-Speech / evaluation /features /signal_to_noise_ratio.py

zyingt

Upload 685 files

0d80816 almost 2 years ago

raw

history blame contribute delete

4.15 kB

	# Copyright (c) 2023 Amphion.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import numpy as np
	import scipy.signal as sig
	import copy
	import librosa


	def bandpower(ps, mode="time"):
	"""
	estimate bandpower, see https://de.mathworks.com/help/signal/ref/bandpower.html
	"""
	if mode == "time":
	x = ps
	l2norm = np.linalg.norm(x) ** 2.0 / len(x)
	return l2norm
	elif mode == "psd":
	return sum(ps)


	def getIndizesAroundPeak(arr, peakIndex, searchWidth=1000):
	peakBins = []
	magMax = arr[peakIndex]
	curVal = magMax
	for i in range(searchWidth):
	newBin = peakIndex + i
	if newBin >= len(arr):
	break
	newVal = arr[newBin]
	if newVal > curVal:
	break
	else:
	peakBins.append(int(newBin))
	curVal = newVal
	curVal = magMax
	for i in range(searchWidth):
	newBin = peakIndex - i
	if newBin < 0:
	break
	newVal = arr[newBin]
	if newVal > curVal:
	break
	else:
	peakBins.append(int(newBin))
	curVal = newVal
	return np.array(list(set(peakBins)))


	def freqToBin(fAxis, Freq):
	return np.argmin(abs(fAxis - Freq))


	def getPeakInArea(psd, faxis, estimation, searchWidthHz=10):
	"""
	returns bin and frequency of the maximum in an area
	"""
	binLow = freqToBin(faxis, estimation - searchWidthHz)
	binHi = freqToBin(faxis, estimation + searchWidthHz)
	peakbin = binLow + np.argmax(psd[binLow : binHi + 1])
	return peakbin, faxis[peakbin]


	def getHarmonics(fund, sr, nHarmonics=6, aliased=False):
	harmonicMultipliers = np.arange(2, nHarmonics + 2)
	harmonicFs = fund * harmonicMultipliers
	if not aliased:
	harmonicFs[harmonicFs > sr / 2] = -1
	harmonicFs = np.delete(harmonicFs, harmonicFs == -1)
	else:
	nyqZone = np.floor(harmonicFs / (sr / 2))
	oddEvenNyq = nyqZone % 2
	harmonicFs = np.mod(harmonicFs, sr / 2)
	harmonicFs[oddEvenNyq == 1] = (sr / 2) - harmonicFs[oddEvenNyq == 1]
	return harmonicFs


	def extract_snr(audio, sr=None):
	"""Extract Signal-to-Noise Ratio for a given audio."""
	if sr != None:
	audio, _ = librosa.load(audio, sr=sr)
	else:
	audio, sr = librosa.load(audio, sr=sr)
	faxis, ps = sig.periodogram(
	audio, fs=sr, window=("kaiser", 38)
	) # get periodogram, parametrized like in matlab
	fundBin = np.argmax(
	ps
	) # estimate fundamental at maximum amplitude, get the bin number
	fundIndizes = getIndizesAroundPeak(
	ps, fundBin
	) # get bin numbers around fundamental peak
	fundFrequency = faxis[fundBin] # frequency of fundamental

	nHarmonics = 18
	harmonicFs = getHarmonics(
	fundFrequency, sr, nHarmonics=nHarmonics, aliased=True
	) # get harmonic frequencies

	harmonicBorders = np.zeros([2, nHarmonics], dtype=np.int16).T
	fullHarmonicBins = np.array([], dtype=np.int16)
	fullHarmonicBinList = []
	harmPeakFreqs = []
	harmPeaks = []
	for i, harmonic in enumerate(harmonicFs):
	searcharea = 0.1 * fundFrequency
	estimation = harmonic

	binNum, freq = getPeakInArea(ps, faxis, estimation, searcharea)
	harmPeakFreqs.append(freq)
	harmPeaks.append(ps[binNum])
	allBins = getIndizesAroundPeak(ps, binNum, searchWidth=1000)
	fullHarmonicBins = np.append(fullHarmonicBins, allBins)
	fullHarmonicBinList.append(allBins)
	harmonicBorders[i, :] = [allBins[0], allBins[-1]]

	fundIndizes.sort()
	pFund = bandpower(ps[fundIndizes[0] : fundIndizes[-1]]) # get power of fundamental

	noisePrepared = copy.copy(ps)
	noisePrepared[fundIndizes] = 0
	noisePrepared[fullHarmonicBins] = 0
	noiseMean = np.median(noisePrepared[noisePrepared != 0])
	noisePrepared[fundIndizes] = noiseMean
	noisePrepared[fullHarmonicBins] = noiseMean

	noisePower = bandpower(noisePrepared)

	r = 10 * np.log10(pFund / noisePower)

	return r, 10 * np.log10(noisePower)