diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c09814d9486ac50e9422bb4d0f4176ccb194fd03 --- /dev/null +++ b/__init__.py @@ -0,0 +1,169 @@ +class Metric: + def __init__(self, name, window, hop=None, verbose=False): + # the metric operates on some fixed rate only or only on mono ? + self.fixed_rate = None + self.mono = False + + # is the metric absolute or relative ? + self.absolute = False + + # length and hop of windows + self.window = window + if hop is None: + hop = window + self.hop = hop + self.name = name + self.verbose = verbose + + def test_window(self, audios, rate): + raise NotImplementedError + + def test(self, *test_files, array_rate=None): + """loading sound files and making sure they all have the same lengths + (zero-padding to the largest). Also works with numpy arrays. + Then, calling the `test_window` function that should be specialised + depending on the metric.""" + + # imports + import soundfile as sf + import resampy + from museval.metrics import Framing + import numpy as np + + audios = [] + maxlen = 0 + if isinstance(test_files, str): + test_files = [test_files] + if self.absolute and len(test_files) > 1: + if self.verbose: + print(' [%s] is absolute. Processing first file only' + % self.name) + test_files = [test_files[0],] + + for file in test_files: + # Loading sound file + if isinstance(file, str): + audio, rate = sf.read(file, always_2d=True) + else: + rate = array_rate + if rate is None: + raise ValueError('Sampling rate needs to be specified ' + 'when feeding numpy arrays.') + audio = file + # Standardize shapes + if len(audio.shape) == 1: + audio = audio[:, None] + if len(audio.shape) != 2: + raise ValueError('Please provide 1D or 2D array, received ' + '{}D array'.format(len(audio.shape))) + + if self.fixed_rate is not None and rate != self.fixed_rate: + if self.verbose: + print(' [%s] preferred is %dkHz rate. 
resampling' + % (self.name, self.fixed_rate)) + audio = resampy.resample(audio, rate, self.fixed_rate, axis=0) + rate = self.fixed_rate + if self.mono and audio.shape[1] > 1: + if self.verbose: + print(' [%s] only supports mono. Will use first channel' + % self.name) + audio = audio[..., 0, None] + if self.mono: + audio = audio[..., 0] + maxlen = max(maxlen, audio.shape[0]) + audios += [audio] + + for index, audio in enumerate(audios): + if audio.shape[0] != maxlen: + new = np.zeros((maxlen,) + audio.shape[1:]) + new[:audio.shape[0]] = audio + audios[index] = new + + if self.window is not None: + framer = Framing(self.window * rate, + self.hop * rate, maxlen) + nwin = framer.nwin + result = {} + for (t, win) in enumerate(framer): + result_t = self.test_window([audio[win] for audio in audios], + rate) + for metric in result_t.keys(): + if metric not in result.keys(): + result[metric] = np.empty(nwin) + result[metric][t] = result_t[metric] + else: + result = self.test_window(audios, rate) + return result + + +import absolute +import relative + + +class MetricsList: + def __init__(self): + self.metrics = [] + + def __add__(self, metric): + self.metrics += [metric] + return self + + def __str__(self): + return 'Metrics: ' + ' '.join([x.name for x in self.metrics]) + + def __call__(self, *files, rate=None): + result = {} + for metric in self.metrics: + result_metric = metric.test(*files, array_rate=rate) + for name in result_metric.keys(): + result[name] = result_metric[name] + return result + + +def load(metrics='', window=2, verbose=False): + """ Load the desired metrics inside a Metrics object that can then + be called to compute all the desired metrics. + + Parameters: + ---------- + metrics: str or list of str + the metrics matching any of these will be automatically loaded. this + match is relative to the structure of the speechmetrics package. 
+ For instance: + * 'absolute' will match all absolute metrics + * 'absolute.srmr' or 'srmr' will only match SRMR + * '' will match all + + window: float + the window length to use for testing the files. + + verbose: boolean + will display information during computations + + Returns: + -------- + + A MetricsList object, that can be run to get the desired metrics + """ + import pkgutil + import importlib + + result = MetricsList() + + found_modules = [] + iterator = pkgutil.walk_packages(__path__, __name__ + '.') + + if isinstance(metrics, str): + metrics = [metrics] + for module_info in iterator: + if any([metric in module_info.name for metric in metrics]): + module = importlib.import_module(module_info.name) + if module not in found_modules: + found_modules += [module], + if hasattr(module, 'load'): + load_function = getattr(module, 'load') + new_metric = load_function(window) + new_metric.verbose = verbose + result += new_metric + print('Loaded ', module_info.name) + return result diff --git a/__pycache__/basis.cpython-38.pyc b/__pycache__/basis.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea7bc745141515177444e76c03388aa4c850d553 Binary files /dev/null and b/__pycache__/basis.cpython-38.pyc differ diff --git a/__pycache__/metric_loader.cpython-38.pyc b/__pycache__/metric_loader.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d05fc1493e0cd66de4ef52326a9cee7fc4292331 Binary files /dev/null and b/__pycache__/metric_loader.cpython-38.pyc differ diff --git a/__pycache__/metrics.cpython-38.pyc b/__pycache__/metrics.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a7b0056b5bdaf4ebfd941a7370c59975720acfa Binary files /dev/null and b/__pycache__/metrics.cpython-38.pyc differ diff --git a/__pycache__/speechscore.cpython-38.pyc b/__pycache__/speechscore.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..ae61aff9bd36c3c8424db7db13f9c44291978a59 Binary files /dev/null and b/__pycache__/speechscore.cpython-38.pyc differ diff --git a/audios/clean/audio_1.wav b/audios/clean/audio_1.wav new file mode 100644 index 0000000000000000000000000000000000000000..acd41d940ef722dc3d7c5356acd0ab6a1c17aafa Binary files /dev/null and b/audios/clean/audio_1.wav differ diff --git a/audios/clean/audio_2.wav b/audios/clean/audio_2.wav new file mode 100644 index 0000000000000000000000000000000000000000..7772afa2ef5f6881d5b16cf0c301fc015b88c7b7 Binary files /dev/null and b/audios/clean/audio_2.wav differ diff --git a/audios/noisy/audio_1.wav b/audios/noisy/audio_1.wav new file mode 100644 index 0000000000000000000000000000000000000000..425d7c04381e56f0b78947743cfa35ca0d99ac17 Binary files /dev/null and b/audios/noisy/audio_1.wav differ diff --git a/audios/noisy/audio_2.wav b/audios/noisy/audio_2.wav new file mode 100644 index 0000000000000000000000000000000000000000..e7d44822410a3a64ddf0d6283787b232679190bf Binary files /dev/null and b/audios/noisy/audio_2.wav differ diff --git a/audios/ref.wav b/audios/ref.wav new file mode 100644 index 0000000000000000000000000000000000000000..acd41d940ef722dc3d7c5356acd0ab6a1c17aafa Binary files /dev/null and b/audios/ref.wav differ diff --git a/audios/test.wav b/audios/test.wav new file mode 100644 index 0000000000000000000000000000000000000000..425d7c04381e56f0b78947743cfa35ca0d99ac17 Binary files /dev/null and b/audios/test.wav differ diff --git a/basis.py b/basis.py new file mode 100644 index 0000000000000000000000000000000000000000..b52cec0c51765ce9d4d30a67f00879ee452ab2d0 --- /dev/null +++ b/basis.py @@ -0,0 +1,113 @@ +class ScoreBasis: + def __init__(self, name=None): + # the score operates on the specified rate + self.score_rate = None + # is the score intrusive or non-intrusive ? 
+ self.intrusive = True #require a reference + self.name = name + + def windowed_scoring(self, audios, score_rate): + raise NotImplementedError(f'In {self.name}, windowed_scoring is not yet implemented') + + def scoring(self, data, window=None, score_rate=None): + """ calling the `windowed_scoring` function that should be specialised + depending on the score.""" + + # imports + #import soundfile as sf + import resampy + from museval.metrics import Framing + + #checking rate + audios = data['audio'] + score_rate = data['rate'] + + if self.score_rate is not None: + score_rate = self.score_rate + + if score_rate != data['rate']: + for index, audio in enumerate(audios): + audio = resampy.resample(audio, data['rate'], score_rate, axis=0) + audios[index] = audio + + if window is not None: + framer = Framing(window * score_rate, window * score_rate, maxlen) + nwin = framer.nwin + result = {} + for (t, win) in enumerate(framer): + result_t = self.windowed_scoring([audio[win] for audio in audios], score_rate) + result[t] = result_t + else: + result = self.windowed_scoring(audios, score_rate) + return result + """ + audios = [] + maxlen = 0 + if isinstance(test_files, str): + test_files = [test_files] + print(f'test_files: {test_files}') + if not self.intrusive and len(test_files) > 1: + if self.verbose: + print(' [%s] is non-intrusive. 
Processing first file only' + % self.name) + test_files = [test_files[0],] + for file in test_files: + # Loading sound file + if isinstance(file, str): + audio, rate = sf.read(file, always_2d=True) + else: + rate = array_rate + if rate is None: + raise ValueError('Sampling rate needs to be specified ' + 'when feeding numpy arrays.') + audio = file + # Standardize shapes + if len(audio.shape) == 1: + audio = audio[:, None] + if len(audio.shape) != 2: + raise ValueError('Please provide 1D or 2D array, received ' + '{}D array'.format(len(audio.shape))) + + if self.fixed_rate is not None and rate != self.fixed_rate: + if self.verbose: + print(' [%s] preferred is %dkHz rate. resampling' + % (self.name, self.fixed_rate)) + audio = resampy.resample(audio, rate, self.fixed_rate, axis=0) + rate = self.fixed_rate + if self.mono and audio.shape[1] > 1: + if self.verbose: + print(' [%s] only supports mono. Will use first channel' + % self.name) + audio = audio[..., 0, None] + if self.mono: + audio = audio[..., 0] + maxlen = max(maxlen, audio.shape[0]) + audios += [audio] + audio = audios[1] + audio[:maxlen-320] = audio[320:] + audios[1] = audio + for index, audio in enumerate(audios): + if audio.shape[0] != maxlen: + new = np.zeros((maxlen,) + audio.shape[1:]) + new[:audio.shape[0]] = audio + audios[index] = new + + if self.window is not None: + framer = Framing(self.window * rate, + self.hop * rate, maxlen) + nwin = framer.nwin + result = {} + for (t, win) in enumerate(framer): + result_t = self.test_window([audio[win] for audio in audios], + rate) + #or metric in result_t.keys(): + # if metric not in result.keys(): + # result[metric] = np.empty(nwin) + # result[metric][t] = result_t[metric] + result[t] = result_t + else: + result = self.test_window(audios, rate) + return result + """ + + diff --git a/demo.py b/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..86bd4c124f4e373fb003cc65a7a8e228360eab47 --- /dev/null +++ b/demo.py @@ -0,0 +1,29 @@ +# 
Import pprint for pretty-printing the results in a more readable format +import pprint +# Import the SpeechScore class to evaluate speech quality metrics +from speechscore import SpeechScore + +# Main block to ensure the code runs only when executed directly +if __name__ == '__main__': + # Initialize a SpeechScore object with a list of score metrics to be evaluated + # Supports any subsets of the list + mySpeechScore = SpeechScore([ + 'SRMR', 'PESQ', 'NB_PESQ', 'STOI', 'SISDR', + 'FWSEGSNR', 'LSD', 'BSSEval', 'DNSMOS', + 'SNR', 'SSNR', 'LLR', 'CSIG', 'CBAK', + 'COVL', 'MCD' + ]) + + # Call the SpeechScore object to evaluate the speech metrics between 'noisy' and 'clean' audio + # Arguments: + # - {test_path, reference_path} supports audio directories or audio paths (.wav or .flac) + # - window (float): seconds, set None to specify no windowing (process the full audio) + # - score_rate (int): specifies the sampling rate at which the metrics should be computed + # - return_mean (bool): set True to specify that the mean score for each metric should be returned + scores = mySpeechScore(test_path='audios/noisy/', reference_path='audios/clean/', window=None, score_rate=16000, return_mean=True) + + # Pretty-print the resulting scores in a readable format + pprint.pprint(scores) + + # Print only the resulting mean scores in a readable format + pprint.pprint(scores['Mean_Score']) diff --git a/requirement.txt b/requirement.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f1049727880c941f324adb2b51fecea317ea44a --- /dev/null +++ b/requirement.txt @@ -0,0 +1,5 @@ +pysptk +pymcd +pyworld +fastdtw +museval diff --git a/scores/__init__.py b/scores/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scores/__pycache__/__init__.cpython-38.pyc b/scores/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..7c9133603b894bbefa859de058c1b5c2edd399db Binary files /dev/null and b/scores/__pycache__/__init__.cpython-38.pyc differ diff --git a/scores/__pycache__/bsseval.cpython-38.pyc b/scores/__pycache__/bsseval.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8a68fc432e2d3ca8c2775cb0e7e71bb602da8f4 Binary files /dev/null and b/scores/__pycache__/bsseval.cpython-38.pyc differ diff --git a/scores/__pycache__/cbak.cpython-38.pyc b/scores/__pycache__/cbak.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25e8db7f234326faa8c7d70a329b8bb9544aef43 Binary files /dev/null and b/scores/__pycache__/cbak.cpython-38.pyc differ diff --git a/scores/__pycache__/covl.cpython-38.pyc b/scores/__pycache__/covl.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c453f99db584c7c3cfa39ca30168f83863d57cff Binary files /dev/null and b/scores/__pycache__/covl.cpython-38.pyc differ diff --git a/scores/__pycache__/csig.cpython-38.pyc b/scores/__pycache__/csig.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21f3655bb7c0948f1bbd3c97835abb146991666d Binary files /dev/null and b/scores/__pycache__/csig.cpython-38.pyc differ diff --git a/scores/__pycache__/fwsegsnr.cpython-38.pyc b/scores/__pycache__/fwsegsnr.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1daa9da0bc041ad9dff863d48f34c75a7869dffe Binary files /dev/null and b/scores/__pycache__/fwsegsnr.cpython-38.pyc differ diff --git a/scores/__pycache__/helper.cpython-38.pyc b/scores/__pycache__/helper.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2742d24b9aa601045c1410087df9d98b9a48f612 Binary files /dev/null and b/scores/__pycache__/helper.cpython-38.pyc differ diff --git a/scores/__pycache__/llr.cpython-38.pyc b/scores/__pycache__/llr.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..44a56daff14f3b3614771ec53b015204b06af7a9 Binary files /dev/null and b/scores/__pycache__/llr.cpython-38.pyc differ diff --git a/scores/__pycache__/lsd.cpython-38.pyc b/scores/__pycache__/lsd.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58d2614bdd343876ec59c5225e2ca6ba92e82d46 Binary files /dev/null and b/scores/__pycache__/lsd.cpython-38.pyc differ diff --git a/scores/__pycache__/mcd.cpython-38.pyc b/scores/__pycache__/mcd.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3a37c67f4a9d42741c0e66214d23b8384089c07 Binary files /dev/null and b/scores/__pycache__/mcd.cpython-38.pyc differ diff --git a/scores/__pycache__/nb_pesq.cpython-38.pyc b/scores/__pycache__/nb_pesq.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4ba23b8074f96a87b7c1c643a4029c8c1ef0536 Binary files /dev/null and b/scores/__pycache__/nb_pesq.cpython-38.pyc differ diff --git a/scores/__pycache__/pesq.cpython-38.pyc b/scores/__pycache__/pesq.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6cac5401940decc3c733b74ecfc5d0e43b714aa Binary files /dev/null and b/scores/__pycache__/pesq.cpython-38.pyc differ diff --git a/scores/__pycache__/sisdr.cpython-38.pyc b/scores/__pycache__/sisdr.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5366ecb19c849097122585cdf73c28cd2938c5c Binary files /dev/null and b/scores/__pycache__/sisdr.cpython-38.pyc differ diff --git a/scores/__pycache__/snr.cpython-38.pyc b/scores/__pycache__/snr.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..531ac2aaad7fa43d9285679795df1a6176666fc6 Binary files /dev/null and b/scores/__pycache__/snr.cpython-38.pyc differ diff --git a/scores/__pycache__/ssnr.cpython-38.pyc b/scores/__pycache__/ssnr.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..a95e66a13bb833b15bda4bdafa8aa7fb7e93f3e2 Binary files /dev/null and b/scores/__pycache__/ssnr.cpython-38.pyc differ diff --git a/scores/__pycache__/stoi.cpython-38.pyc b/scores/__pycache__/stoi.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..175f433d17106d06c8463e38746a5922b8d5521c Binary files /dev/null and b/scores/__pycache__/stoi.cpython-38.pyc differ diff --git a/scores/bsseval.py b/scores/bsseval.py new file mode 100644 index 0000000000000000000000000000000000000000..e4af1f0456ce49041147bda11c538a08eddb8ff4 --- /dev/null +++ b/scores/bsseval.py @@ -0,0 +1,21 @@ +import numpy as np +from basis import ScoreBasis + + +class BSSEval(ScoreBasis): + def __init__(self): + super(BSSEval, self).__init__(name='BSSEval') + self.intrusive = False + + def windowed_scoring(self, audios, score_rate): + bss_window = np.inf + bss_hop = np.inf + from museval.metrics import bss_eval + if len(audios) != 2: + raise ValueError('BSSEval needs a reference and a test signals.') + + result = bss_eval(reference_sources=audios[1][None,...], # shape: [nsrc, nsample, nchannels] + estimated_sources=audios[0][None,...], + window=bss_window * score_rate, + hop=bss_hop * score_rate) + return {'SDR': result[0][0][0], 'ISR': result[1][0][0], 'SAR': result[3][0][0]} diff --git a/scores/cbak.py b/scores/cbak.py new file mode 100644 index 0000000000000000000000000000000000000000..f50f125176881a2b2a196ac584d11741c0cd5233 --- /dev/null +++ b/scores/cbak.py @@ -0,0 +1,37 @@ +from basis import ScoreBasis +import numpy as np +from pesq import pesq +from scores.helper import wss, llr, SSNR, trim_mos + +class CBAK(ScoreBasis): + def __init__(self): + super(CBAK, self).__init__(name='CBAK') + self.score_rate = 16000 + self.intrusive = False + + def windowed_scoring(self, audios, score_rate): + if len(audios) != 2: + raise ValueError('CBAK needs a reference and a test signals.') + return cal_CBAK(audios[0], audios[1], score_rate) 
+ +def cal_CBAK(target_wav, pred_wav, fs): + alpha = 0.95 + + # Compute WSS measure + wss_dist_vec = wss(target_wav, pred_wav, fs) + wss_dist_vec = sorted(wss_dist_vec, reverse=False) + wss_dist = np.mean(wss_dist_vec[:int(round(len(wss_dist_vec) * alpha))]) + + # Compute the SSNR + snr_mean, segsnr_mean = SSNR(target_wav, pred_wav, fs) + segSNR = np.mean(segsnr_mean) + + # Compute the PESQ + pesq_raw = pesq(fs, target_wav, pred_wav, 'wb') + + # Cbak + Cbak = 1.634 + 0.478 * pesq_raw - 0.007 * wss_dist + 0.063 * segSNR + Cbak = trim_mos(Cbak) + + return Cbak + diff --git a/scores/covl.py b/scores/covl.py new file mode 100644 index 0000000000000000000000000000000000000000..abffb21a041cbb5deec966076da856d58128a15d --- /dev/null +++ b/scores/covl.py @@ -0,0 +1,39 @@ +from basis import ScoreBasis +import numpy as np +from pesq import pesq +from scores.helper import wss, llr, SSNR, trim_mos + +class COVL(ScoreBasis): + def __init__(self): + super(COVL, self).__init__(name='COVL') + self.score_rate = 16000 + self.intrusive = False + + def windowed_scoring(self, audios, score_rate): + if len(audios) != 2: + raise ValueError('COVL needs a reference and a test signals.') + return cal_COVL(audios[0], audios[1], score_rate) + +def cal_COVL(target_wav, pred_wav, fs): + alpha = 0.95 + + # Compute WSS measure + wss_dist_vec = wss(target_wav, pred_wav, fs) + wss_dist_vec = sorted(wss_dist_vec, reverse=False) + wss_dist = np.mean(wss_dist_vec[:int(round(len(wss_dist_vec) * alpha))]) + + # Compute LLR measure + LLR_dist = llr(target_wav, pred_wav, fs) + LLR_dist = sorted(LLR_dist, reverse=False) + LLRs = LLR_dist + LLR_len = round(len(LLR_dist) * alpha) + llr_mean = np.mean(LLRs[:LLR_len]) + + # Compute the PESQ + pesq_raw = pesq(fs, target_wav, pred_wav, 'wb') + + # Covl + Covl = 1.594 + 0.805 * pesq_raw - 0.512 * llr_mean - 0.007 * wss_dist + Covl = trim_mos(Covl) + + return Covl diff --git a/scores/csig.py b/scores/csig.py new file mode 100644 index 
0000000000000000000000000000000000000000..5bc5579aefd7a40a641f53a539eaddb46f553794 --- /dev/null +++ b/scores/csig.py @@ -0,0 +1,38 @@ +from basis import ScoreBasis +import numpy as np +from pesq import pesq +from scores.helper import wss, llr, SSNR, trim_mos + +class CSIG(ScoreBasis): + def __init__(self): + super(CSIG, self).__init__(name='CSIG') + self.score_rate = 16000 + + def windowed_scoring(self, audios, score_rate): + if len(audios) != 2: + raise ValueError('CSIG needs a reference and a test signals.') + return cal_CSIG(audios[0], audios[1], score_rate) + +def cal_CSIG(target_wav, pred_wav, fs): + alpha = 0.95 + + # Compute WSS measure + wss_dist_vec = wss(target_wav, pred_wav, fs) + wss_dist_vec = sorted(wss_dist_vec, reverse=False) + wss_dist = np.mean(wss_dist_vec[:int(round(len(wss_dist_vec) * alpha))]) + + # Compute LLR measure + LLR_dist = llr(target_wav, pred_wav, fs) + LLR_dist = sorted(LLR_dist, reverse=False) + LLRs = LLR_dist + LLR_len = round(len(LLR_dist) * alpha) + llr_mean = np.mean(LLRs[:LLR_len]) + + # Compute the PESQ + pesq_raw = pesq(fs, target_wav, pred_wav, 'wb') + + # Csig + Csig = 3.093 - 1.029 * llr_mean + 0.603 * pesq_raw - 0.009 * wss_dist + Csig = float(trim_mos(Csig)) + + return Csig diff --git a/scores/dnsmos/DNSMOS/bak_ovr.onnx b/scores/dnsmos/DNSMOS/bak_ovr.onnx new file mode 100644 index 0000000000000000000000000000000000000000..7c3f8e41619d513ae3d9983729841509eebe4cbc --- /dev/null +++ b/scores/dnsmos/DNSMOS/bak_ovr.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f335c90994618150192a656a474bcf8a9cbcedbc47965494ba8da79605d1308 +size 742375 diff --git a/scores/dnsmos/DNSMOS/model_v8.onnx b/scores/dnsmos/DNSMOS/model_v8.onnx new file mode 100644 index 0000000000000000000000000000000000000000..0e04b14824c4dfc6af9d62040c92c09da56f21e7 --- /dev/null +++ b/scores/dnsmos/DNSMOS/model_v8.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9246480c58567bc6affd4200938e77eef49468c8bc7ed3776d109c07456f6e91 +size 224860 diff --git a/scores/dnsmos/DNSMOS/sig.onnx b/scores/dnsmos/DNSMOS/sig.onnx new file mode 100644 index 0000000000000000000000000000000000000000..eaa69f859d01dcbe66fe6a02f9b696f312f971e3 --- /dev/null +++ b/scores/dnsmos/DNSMOS/sig.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2fbdb293bc2366dfbae2b7477c490f981d24a8b4405efd3c11787569c6549d7 +size 742203 diff --git a/scores/dnsmos/DNSMOS/sig_bak_ovr.onnx b/scores/dnsmos/DNSMOS/sig_bak_ovr.onnx new file mode 100644 index 0000000000000000000000000000000000000000..81f885c678aefcf76de1f00fbc80167aa1ca1d96 --- /dev/null +++ b/scores/dnsmos/DNSMOS/sig_bak_ovr.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:269fbebdb513aa23cddfbb593542ecc540284a91849ac50516870e1ac78f6edd +size 1157965 diff --git a/scores/dnsmos/__pycache__/dnsmos.cpython-38.pyc b/scores/dnsmos/__pycache__/dnsmos.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7687934282d311409dfe34c0db9bbcb64244aa98 Binary files /dev/null and b/scores/dnsmos/__pycache__/dnsmos.cpython-38.pyc differ diff --git a/scores/dnsmos/dnsmos.py b/scores/dnsmos/dnsmos.py new file mode 100644 index 0000000000000000000000000000000000000000..100b0f813c17078c356db8271438539c8ea89054 --- /dev/null +++ b/scores/dnsmos/dnsmos.py @@ -0,0 +1,94 @@ +import os + +import librosa +import numpy as np +import numpy.polynomial.polynomial as poly +import onnxruntime as ort +import soundfile as sf + +SAMPLING_RATE = 16000 +INPUT_LENGTH = 9.01 + +from basis import ScoreBasis + + +class DNSMOS(ScoreBasis): + def __init__(self): + super(DNSMOS, self).__init__(name='DNSMOS') + self.intrusive = True + self.score_rate = 16000 + self.p808_model_path = os.path.join('scores/dnsmos/DNSMOS', 'model_v8.onnx') + self.primary_model_path = os.path.join('scores/dnsmos/DNSMOS', 'sig_bak_ovr.onnx') + self.compute_score = 
ComputeScore(self.primary_model_path, self.p808_model_path) + + def windowed_scoring(self, audios, rate): + if len(audios) == 2: + return self.compute_score.cal_mos(audios[1], rate) + else: + return self.compute_score.cal_mos(audios[0], rate) + +class ComputeScore: + def __init__(self, primary_model_path, p808_model_path) -> None: + self.onnx_sess = ort.InferenceSession(primary_model_path) + self.p808_onnx_sess = ort.InferenceSession(p808_model_path) + + def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True): + mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=frame_size+1, hop_length=hop_length, n_mels=n_mels) + if to_db: + mel_spec = (librosa.power_to_db(mel_spec, ref=np.max)+40)/40 + return mel_spec.T + + def get_polyfit_val(self, sig, bak, ovr): + p_ovr = np.poly1d([-0.06766283, 1.11546468, 0.04602535]) + p_sig = np.poly1d([-0.08397278, 1.22083953, 0.0052439 ]) + p_bak = np.poly1d([-0.13166888, 1.60915514, -0.39604546]) + + sig_poly = p_sig(sig) + bak_poly = p_bak(bak) + ovr_poly = p_ovr(ovr) + + return sig_poly, bak_poly, ovr_poly + + def cal_mos(self, audio, sampling_rate): + fs = sampling_rate + actual_audio_len = len(audio) + len_samples = int(INPUT_LENGTH*fs) + while len(audio) < len_samples: + audio = np.append(audio, audio) + + num_hops = int(np.floor(len(audio)/fs) - INPUT_LENGTH)+1 + hop_len_samples = fs + predicted_mos_sig_seg_raw = [] + predicted_mos_bak_seg_raw = [] + predicted_mos_ovr_seg_raw = [] + predicted_mos_sig_seg = [] + predicted_mos_bak_seg = [] + predicted_mos_ovr_seg = [] + predicted_p808_mos = [] + + for idx in range(num_hops): + audio_seg = audio[int(idx*hop_len_samples) : int((idx+INPUT_LENGTH)*hop_len_samples)] + if len(audio_seg) < len_samples: + continue + + input_features = np.array(audio_seg).astype('float32')[np.newaxis,:] + p808_input_features = np.array(self.audio_melspec(audio=audio_seg[:-160])).astype('float32')[np.newaxis, :, :] + oi = {'input_1': input_features} + 
p808_oi = {'input_1': p808_input_features} + p808_mos = self.p808_onnx_sess.run(None, p808_oi)[0][0][0] + mos_sig_raw,mos_bak_raw,mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0] + mos_sig,mos_bak,mos_ovr = self.get_polyfit_val(mos_sig_raw,mos_bak_raw,mos_ovr_raw) + predicted_mos_sig_seg_raw.append(mos_sig_raw) + predicted_mos_bak_seg_raw.append(mos_bak_raw) + predicted_mos_ovr_seg_raw.append(mos_ovr_raw) + predicted_mos_sig_seg.append(mos_sig) + predicted_mos_bak_seg.append(mos_bak) + predicted_mos_ovr_seg.append(mos_ovr) + predicted_p808_mos.append(p808_mos) + + results = {} + results['OVRL'] = np.mean(predicted_mos_ovr_seg) + results['SIG'] = np.mean(predicted_mos_sig_seg) + results['BAK'] = np.mean(predicted_mos_bak_seg) + results['P808_MOS'] = np.mean(predicted_p808_mos) + return results diff --git a/scores/fwsegsnr.py b/scores/fwsegsnr.py new file mode 100644 index 0000000000000000000000000000000000000000..ae41fa1de9ced547e1f79c803cd9daa1962d07d2 --- /dev/null +++ b/scores/fwsegsnr.py @@ -0,0 +1,49 @@ +import librosa +import numpy as np +from basis import ScoreBasis + +class FWSEGSNR(ScoreBasis): + def __init__(self): + super(FWSEGSNR, self).__init__(name='FWSEGSNR') + self.intrusive = False + + def windowed_scoring(self, audios, score_rate): + if len(audios) != 2: + raise ValueError('FWSEGSNR needs a reference and a test signals.') + return fwsegsnr(audios[1], audios[0], score_rate) + +def fwsegsnr(x, y, fs, frame_sz = 0.025, shift_sz= 0.01, win='hann', numband=23): + epsilon = np.finfo(np.float32).eps + frame = int(np.fix(frame_sz * fs)) + shift = int(np.fix(shift_sz * fs)) + window = win + nband = numband + noverlap = frame - shift + fftpt = int(2**np.ceil(np.log2(np.abs(frame)))) + x = x / np.sqrt(sum(np.power(x, 2))) + y = y / np.sqrt(sum(np.power(y, 2))) + + assert len(x) == len(y), print('Wav length are not matched!') + X_stft = np.abs(librosa.stft(x, n_fft=fftpt, hop_length=shift, win_length=frame, window=window, center=False)) + Y_stft = 
np.abs(librosa.stft(y, n_fft=fftpt, hop_length=shift, win_length=frame, window=window, center=False)) + + num_freq = X_stft.shape[0] + num_frame = X_stft.shape[1] + + X_mel = librosa.feature.melspectrogram(S=X_stft, sr=fs, n_mels=nband, fmin=0, fmax=fs/2) + Y_mel = librosa.feature.melspectrogram(S=Y_stft, sr=fs, n_mels=nband, fmin=0, fmax=fs/2) + + # Calculate SNR. + + W = np.power(Y_mel, 0.2) + E = X_mel - Y_mel + E[E == 0.0] = epsilon + E_power = np.power(E, 2) + Y_div_E = np.divide((np.power(Y_mel,2)), (np.power(E,2))) + Y_div_E[Y_div_E==0] = epsilon + ds = 10 * np.divide(np.sum(np.multiply(W, np.log10(Y_div_E)), 1), np.sum(W, 1)) + ds[ds > 35] = 35 + ds[ds < -10] = -10 + d = np.mean(ds) + return d + diff --git a/scores/helper.py b/scores/helper.py new file mode 100644 index 0000000000000000000000000000000000000000..e34ba2be06d21b34aff666be23d3c78493e8b764 --- /dev/null +++ b/scores/helper.py @@ -0,0 +1,307 @@ +""" +Modifications in Metrics + +# Original copyright: +# Copyright (c) Facebook, Inc. and its affiliates. 
# Demucs (https://github.com/facebookresearch/denoiser) / author: adefossez
# (end of the module docstring opened before this chunk)
import numpy as np
from scipy.linalg import toeplitz


# ----------------------------- HELPERS ------------------------------------ #
def trim_mos(val):
    """Clamp a MOS-style score to the valid [1, 5] range."""
    return min(max(val, 1), 5)


def lpcoeff(speech_frame, model_order):
    """LPC analysis of a single (windowed) speech frame.

    Parameters
    ----------
    speech_frame : 1D ndarray
        Time-domain samples of one analysis frame.
    model_order : int
        LPC order P.

    Returns
    -------
    (acorr, refcoeff, lpparams) : float32 ndarrays
        Autocorrelation lags R[0..P], reflection coefficients k_1..k_P,
        and the LPC polynomial [1, -a_1, ..., -a_P].
    """
    # (1) autocorrelation lags R[k] for k = 0..P
    winlength = speech_frame.shape[0]
    R = []
    for k in range(model_order + 1):
        first = speech_frame[:(winlength - k)]
        second = speech_frame[k:winlength]
        R.append(np.sum(first * second))

    # (2) Levinson-Durbin recursion
    a = np.ones((model_order,))
    E = np.zeros((model_order + 1,))
    rcoeff = np.zeros((model_order,))
    E[0] = R[0]
    for i in range(model_order):
        if i == 0:
            sum_term = 0
        else:
            a_past = a[:i]
            sum_term = np.sum(a_past * np.array(R[i:0:-1]))
        rcoeff[i] = (R[i + 1] - sum_term) / E[i]
        a[i] = rcoeff[i]
        if i > 0:
            # order update: a_j <- a_j - k_i * a_{i-j}
            a[:i] = a_past[:i] - rcoeff[i] * a_past[::-1]
        E[i + 1] = (1 - rcoeff[i] * rcoeff[i]) * E[i]

    acorr = np.array(R, dtype=np.float32)
    refcoeff = np.array(rcoeff, dtype=np.float32)
    # negate so the returned polynomial is [1, -a_1, ..., -a_P]
    a = a * -1
    lpparams = np.array([1] + list(a), dtype=np.float32)
    return acorr, refcoeff, lpparams
# -------------------------------------------------------------------------- #


def SSNR(ref_wav, deg_wav, srate=16000, eps=1e-10):
    """Segmental Signal-to-Noise Ratio objective speech-quality measure.

    Implements the segmental SNR as defined in [1, p. 45] (Eq. 2.12).

    Parameters
    ----------
    ref_wav, deg_wav : 1D ndarrays of equal length (reference / degraded).
    srate : int, sampling rate in Hz.
    eps : float, numerical floor for the per-frame log ratio.

    Returns
    -------
    (overall_snr, segmental_snr) : float and list of per-frame SNRs,
    each frame value clamped to [-10, 35] dB.
    """
    # Work on float copies: the previous version normalized the arrays in
    # place, silently mutating the caller's `ref_wav`/`deg_wav`.
    clean_speech = np.array(ref_wav, dtype=float)
    processed_speech = np.array(deg_wav, dtype=float)
    clean_length = clean_speech.shape[0]
    processed_length = processed_speech.shape[0]

    # scale both to the same dynamic range and remove DC
    clean_speech -= clean_speech.mean()
    processed_speech -= processed_speech.mean()
    processed_speech *= (np.max(np.abs(clean_speech)) /
                         np.max(np.abs(processed_speech)))

    # overall (non-segmental) SNR, computed on the normalized signals as
    # before (ref_wav/deg_wav aliased the normalized arrays in the original)
    dif = clean_speech - processed_speech
    overall_snr = 10 * np.log10(np.sum(clean_speech ** 2) /
                                (np.sum(dif ** 2) + 10e-20))

    # framing parameters
    winlength = int(np.round(30 * srate / 1000))  # 30 ms window
    skiprate = winlength // 4                     # 75% overlap
    MIN_SNR = -10
    MAX_SNR = 35

    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    segmental_snr = []

    for frame_count in range(int(num_frames)):
        # (1) window the current reference/processed frames
        clean_frame = clean_speech[start:start + winlength] * window
        processed_frame = processed_speech[start:start + winlength] * window

        # (2) per-frame SNR, clamped to [MIN_SNR, MAX_SNR]
        signal_energy = np.sum(clean_frame ** 2)
        noise_energy = np.sum((clean_frame - processed_frame) ** 2)
        frame_snr = 10 * np.log10(signal_energy / (noise_energy + eps) + eps)
        segmental_snr.append(min(max(frame_snr, MIN_SNR), MAX_SNR))
        start += int(skiprate)
    return overall_snr, segmental_snr


def wss(ref_wav, deg_wav, srate):
    """Weighted Spectral Slope measure (Klatt 1982), one value per frame.

    Returns a list with one WSS distortion value per 30 ms frame.
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]

    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.)  # 30 ms frame (240 samples @ 8 kHz)
    skiprate = np.floor(winlength / 4)
    max_freq = srate / 2
    num_crit = 25  # number of critical bands

    n_fft = int(2 ** np.ceil(np.log(2 * winlength) / np.log(2)))
    n_fftby2 = int(n_fft / 2)
    Kmax = 20
    Klocmax = 1

    # critical band filter definitions (center frequency and BW in Hz)
    cent_freq = [50., 120, 190, 260, 330, 400, 470, 540, 617.372,
                 703.378, 798.717, 904.128, 1020.38, 1148.30,
                 1288.72, 1442.54, 1610.70, 1794.16, 1993.93,
                 2211.08, 2446.71, 2701.97, 2978.04, 3276.17,
                 3597.63]
    bandwidth = [70., 70, 70, 70, 70, 70, 70, 77.3724, 86.0056,
                 95.3398, 105.411, 116.256, 127.914, 140.423,
                 153.823, 168.154, 183.457, 199.776, 217.153,
                 235.631, 255.255, 276.072, 298.126, 321.465,
                 346.136]

    bw_min = bandwidth[0]  # minimum critical bandwidth

    # Gaussian-shaped critical band filters with equal weight sums per band;
    # weights below the filter's -30 dB point are zeroed.
    min_factor = np.exp(-30. / (2 * 2.303))

    crit_filter = np.zeros((num_crit, n_fftby2))
    for i in range(num_crit):
        f0 = (cent_freq[i] / max_freq) * (n_fftby2)
        bw = (bandwidth[i] / max_freq) * (n_fftby2)
        norm_factor = np.log(bw_min) - np.log(bandwidth[i])
        j = list(range(n_fftby2))
        crit_filter[i, :] = np.exp(-11 * (((j - np.floor(f0)) / bw) ** 2) +
                                   norm_factor)
        crit_filter[i, :] = crit_filter[i, :] * (crit_filter[i, :] >
                                                 min_factor)

    # per-frame WSS computation
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    distortion = []

    for frame_count in range(num_frames):
        # (1) window the current frames
        clean_frame = clean_speech[start:start + winlength] * window
        processed_frame = processed_speech[start:start + winlength] * window

        # (2) power spectra of clean and processed frames
        clean_spec = (np.abs(np.fft.fft(clean_frame, n_fft)) ** 2)
        processed_spec = (np.abs(np.fft.fft(processed_frame, n_fft)) ** 2)
        clean_energy = [None] * num_crit
        processed_energy = [None] * num_crit

        # (3) filterbank output energies in dB (floored at 1e-10 pre-log)
        for i in range(num_crit):
            clean_energy[i] = np.sum(clean_spec[:n_fftby2] *
                                     crit_filter[i, :])
            processed_energy[i] = np.sum(processed_spec[:n_fftby2] *
                                         crit_filter[i, :])
        clean_energy = np.array(clean_energy).reshape(-1, 1)
        eps = np.ones((clean_energy.shape[0], 1)) * 1e-10
        clean_energy = np.concatenate((clean_energy, eps), axis=1)
        clean_energy = 10 * np.log10(np.max(clean_energy, axis=1))
        processed_energy = np.array(processed_energy).reshape(-1, 1)
        processed_energy = np.concatenate((processed_energy, eps), axis=1)
        processed_energy = 10 * np.log10(np.max(processed_energy, axis=1))

        # (4) spectral slopes (dB[i+1] - dB[i])
        clean_slope = clean_energy[1:num_crit] - clean_energy[:num_crit - 1]
        processed_slope = processed_energy[1:num_crit] - \
            processed_energy[:num_crit - 1]

        # (5) nearest peak location for each band: search right on a rising
        # slope, left on a falling one
        clean_loc_peak = []
        processed_loc_peak = []
        for i in range(num_crit - 1):
            if clean_slope[i] > 0:
                n = i
                while n < num_crit - 1 and clean_slope[n] > 0:
                    n += 1
                clean_loc_peak.append(clean_energy[n - 1])
            else:
                n = i
                while n >= 0 and clean_slope[n] <= 0:
                    n -= 1
                clean_loc_peak.append(clean_energy[n + 1])
            # same search on the processed spectrum
            if processed_slope[i] > 0:
                n = i
                while n < num_crit - 1 and processed_slope[n] > 0:
                    n += 1
                processed_loc_peak.append(processed_energy[n - 1])
            else:
                n = i
                while n >= 0 and processed_slope[n] <= 0:
                    n -= 1
                processed_loc_peak.append(processed_energy[n + 1])

        # (6) weights emphasize spectral peaks over valleys (Klatt 1982,
        # p. 1280); the clean and processed weights are averaged
        dBMax_clean = max(clean_energy)
        dBMax_processed = max(processed_energy)
        clean_loc_peak = np.array(clean_loc_peak)
        processed_loc_peak = np.array(processed_loc_peak)
        Wmax_clean = Kmax / (Kmax + dBMax_clean - clean_energy[:num_crit - 1])
        Wlocmax_clean = Klocmax / (Klocmax + clean_loc_peak -
                                   clean_energy[:num_crit - 1])
        W_clean = Wmax_clean * Wlocmax_clean
        Wmax_processed = Kmax / (Kmax + dBMax_processed -
                                 processed_energy[:num_crit - 1])
        Wlocmax_processed = Klocmax / (Klocmax + processed_loc_peak -
                                       processed_energy[:num_crit - 1])
        W_processed = Wmax_processed * Wlocmax_processed
        W = (W_clean + W_processed) / 2
        distortion.append(np.sum(W * (clean_slope[:num_crit - 1] -
                                      processed_slope[:num_crit - 1]) ** 2))

        # normalization by the weight sum is not part of Klatt's paper but
        # keeps the measure comparable across frames
        distortion[frame_count] = distortion[frame_count] / np.sum(W)
        start += int(skiprate)
    return distortion


def llr(ref_wav, deg_wav, srate):
    """Log-Likelihood Ratio measure; returns one value per frame (ndarray).

    NaNs from degenerate frames are mapped to 0 via np.nan_to_num.
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]
    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.)  # 30 ms frame (240 samples @ 8 kHz)
    skiprate = np.floor(winlength / 4)
    # LPC analysis order depends on the signal bandwidth
    if srate < 10000:
        P = 10
    else:
        P = 16

    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    distortion = []

    for frame_count in range(num_frames):
        # (1) window the current frames
        clean_frame = clean_speech[start:start + winlength] * window
        processed_frame = processed_speech[start:start + winlength] * window

        # (2) autocorrelation lags and LPC parameters of both frames
        R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
        R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
        A_clean = A_clean[None, :]
        A_processed = A_processed[None, :]

        # (3) LLR for this frame: quadratic forms against the clean
        # autocorrelation (Toeplitz) matrix
        numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
        denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)

        if (numerator / denominator) <= 0:
            print(f'Numerator: {numerator}')
            print(f'Denominator: {denominator}')

        log_ = np.log(numerator / denominator)
        distortion.append(np.squeeze(log_))
        start += int(skiprate)
    return np.nan_to_num(np.array(distortion))
# -------------------------------------------------------------------------- #

# --- next in this diff chunk: new file scores/helper_bk.py ---
# (its module docstring opens here in the original: "Modifications in
#  Metrics" plus the original Facebook/Demucs copyright notice)
# Demucs (https://github.com/facebookresearch/denoiser) / author: adefossez
# (closing of the scores/helper_bk.py module docstring in the original diff)
import numpy as np
from scipy.linalg import toeplitz

# ----------------------------- HELPERS ------------------------------------ #
def trim_mos(val):
    # Clamp a MOS-style score to the valid [1, 5] range.
    return min(max(val, 1), 5)

def lpcoeff(speech_frame, model_order):
    """LPC analysis of one windowed speech frame.

    Returns (acorr, refcoeff, lpparams): autocorrelation lags R[0..P],
    reflection coefficients, and the LPC polynomial [1, -a_1, ..., -a_P],
    all as float32 arrays.
    """
    # (1) Compute Autocor lags R[k], k = 0..P
    winlength = speech_frame.shape[0]
    R = []
    for k in range(model_order + 1):
        first = speech_frame[:(winlength - k)]
        second = speech_frame[k:winlength]
        R.append(np.sum(first * second))

    # (2) Lev-Durbin recursion
    a = np.ones((model_order,))
    E = np.zeros((model_order + 1,))
    rcoeff = np.zeros((model_order,))
    E[0] = R[0]
    for i in range(model_order):
        if i == 0:
            sum_term = 0
        else:
            a_past = a[:i]
            sum_term = np.sum(a_past * np.array(R[i:0:-1]))
        rcoeff[i] = (R[i+1] - sum_term)/E[i]
        a[i] = rcoeff[i]
        if i > 0:
            # order update: a_j <- a_j - k_i * a_{i-j}
            a[:i] = a_past[:i] - rcoeff[i] * a_past[::-1]
        E[i+1] = (1-rcoeff[i]*rcoeff[i])*E[i]
    acorr = np.array(R, dtype=np.float32)
    refcoeff = np.array(rcoeff, dtype=np.float32)
    # negate so lpparams is [1, -a_1, ..., -a_P]
    a = a * -1
    lpparams = np.array([1] + list(a), dtype=np.float32)
    # NOTE(review): the three casts below are redundant re-casts of arrays
    # that are already float32.
    acorr = np.array(acorr, dtype=np.float32)
    refcoeff = np.array(refcoeff, dtype=np.float32)
    lpparams = np.array(lpparams, dtype=np.float32)

    return acorr, refcoeff, lpparams
# -------------------------------------------------------------------------- #


def SSNR(ref_wav, deg_wav, srate=16000, eps=1e-10):
    """ Segmental Signal-to-Noise Ratio Objective Speech Quality Measure
    This function implements the segmental signal-to-noise ratio
    as defined in [1, p. 45] (see Equation 2.12).

    Returns (overall_snr, per-frame segmental SNR list clamped to [-10, 35]).
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]

    # scale both to have same dynamic range. Remove DC too.
    # NOTE(review): these operate in place and therefore mutate the caller's
    # ref_wav/deg_wav arrays (clean_speech/processed_speech are aliases).
    clean_speech -= clean_speech.mean()
    processed_speech -= processed_speech.mean()
    processed_speech *= (np.max(np.abs(clean_speech)) / np.max(np.abs(processed_speech)))

    # Signal-to-Noise Ratio (overall, on the normalized signals)
    dif = ref_wav - deg_wav
    overall_snr = 10 * np.log10(np.sum(ref_wav ** 2) / (np.sum(dif ** 2) +
                                                        10e-20))
    # framing parameters
    winlength = int(np.round(30 * srate / 1000))  # 30 msecs
    skiprate = winlength // 4  # 75% frame overlap
    MIN_SNR = -10
    MAX_SNR = 35

    # For each frame, calculate SSNR
    num_frames = int(clean_length / skiprate - (winlength/skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    segmental_snr = []

    for frame_count in range(int(num_frames)):
        # (1) get the frames for the test and ref speech.
        # Apply Hanning Window
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Compute Segmental SNR, clamped to [MIN_SNR, MAX_SNR]
        signal_energy = np.sum(clean_frame ** 2)
        noise_energy = np.sum((clean_frame - processed_frame) ** 2)
        segmental_snr.append(10 * np.log10(signal_energy / (noise_energy + eps)+ eps))
        segmental_snr[-1] = max(segmental_snr[-1], MIN_SNR)
        segmental_snr[-1] = min(segmental_snr[-1], MAX_SNR)
        start += int(skiprate)
    return overall_snr, segmental_snr


def wss(ref_wav, deg_wav, srate):
    """Weighted Spectral Slope measure (Klatt 1982); one value per frame."""
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]

    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.)  # 240 wlen in samples (at 8 kHz)
    skiprate = np.floor(winlength / 4)
    max_freq = srate / 2
    num_crit = 25  # num of critical bands

    USE_FFT_SPECTRUM = 1  # NOTE(review): unused
    n_fft = int(2 ** np.ceil(np.log(2*winlength)/np.log(2)))
    n_fftby2 = int(n_fft / 2)
    Kmax = 20
    Klocmax = 1

    # Critical band filter definitions (Center frequency and BW in Hz)
    cent_freq = [50., 120, 190, 260, 330, 400, 470, 540, 617.372,
                 703.378, 798.717, 904.128, 1020.38, 1148.30,
                 1288.72, 1442.54, 1610.70, 1794.16, 1993.93,
                 2211.08, 2446.71, 2701.97, 2978.04, 3276.17,
                 3597.63]
    bandwidth = [70., 70, 70, 70, 70, 70, 70, 77.3724, 86.0056,
                 95.3398, 105.411, 116.256, 127.914, 140.423,
                 153.823, 168.154, 183.457, 199.776, 217.153,
                 235.631, 255.255, 276.072, 298.126, 321.465,
                 346.136]

    bw_min = bandwidth[0]  # min critical bandwidth

    # set up critical band filters. Note here that Gaussianly shaped filters
    # are used. Also, the sum of the filter weights are equivalent for each
    # critical band filter. Filter less than -30 dB and set to zero.
    min_factor = np.exp(-30. / (2 * 2.303))  # -30 dB point of filter

    crit_filter = np.zeros((num_crit, n_fftby2))
    all_f0 = []  # NOTE(review): collected but never used
    for i in range(num_crit):
        f0 = (cent_freq[i] / max_freq) * (n_fftby2)
        all_f0.append(np.floor(f0))
        bw = (bandwidth[i] / max_freq) * (n_fftby2)
        norm_factor = np.log(bw_min) - np.log(bandwidth[i])
        j = list(range(n_fftby2))
        crit_filter[i, :] = np.exp(-11 * (((j - np.floor(f0)) / bw) ** 2) + \
                                   norm_factor)
        crit_filter[i, :] = crit_filter[i, :] * (crit_filter[i, :] > \
                                                 min_factor)

    # For each frame of input speech, compute Weighted Spectral Slope Measure
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0  # starting sample
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    distortion = []

    for frame_count in range(num_frames):
        # (1) Get the Frames for the test and reference speeech.
        # Multiply by Hanning window.
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Compuet Power Spectrum of clean and processed
        clean_spec = (np.abs(np.fft.fft(clean_frame, n_fft)) ** 2)
        processed_spec = (np.abs(np.fft.fft(processed_frame, n_fft)) ** 2)
        clean_energy = [None] * num_crit
        processed_energy = [None] * num_crit

        # (3) Compute Filterbank output energies (in dB), floored at 1e-10
        for i in range(num_crit):
            clean_energy[i] = np.sum(clean_spec[:n_fftby2] * \
                                     crit_filter[i, :])
            processed_energy[i] = np.sum(processed_spec[:n_fftby2] * \
                                         crit_filter[i, :])
        clean_energy = np.array(clean_energy).reshape(-1, 1)
        eps = np.ones((clean_energy.shape[0], 1)) * 1e-10
        clean_energy = np.concatenate((clean_energy, eps), axis=1)
        clean_energy = 10 * np.log10(np.max(clean_energy, axis=1))
        processed_energy = np.array(processed_energy).reshape(-1, 1)
        processed_energy = np.concatenate((processed_energy, eps), axis=1)
        processed_energy = 10 * np.log10(np.max(processed_energy, axis=1))

        # (4) Compute Spectral Shape (dB[i+1] - dB[i])
        clean_slope = clean_energy[1:num_crit] - clean_energy[:num_crit-1]
        processed_slope = processed_energy[1:num_crit] - \
            processed_energy[:num_crit-1]

        # (5) Find the nearest peak locations in the spectra to each
        # critical band. If the slope is negative, we search
        # to the left. If positive, we search to the right.
        clean_loc_peak = []
        processed_loc_peak = []
        for i in range(num_crit - 1):
            if clean_slope[i] > 0:
                # search to the right
                n = i
                while n < num_crit - 1 and clean_slope[n] > 0:
                    n += 1
                clean_loc_peak.append(clean_energy[n - 1])
            else:
                # search to the left
                n = i
                while n >= 0 and clean_slope[n] <= 0:
                    n -= 1
                clean_loc_peak.append(clean_energy[n + 1])
            # find the peaks in the processed speech signal
            if processed_slope[i] > 0:
                n = i
                while n < num_crit - 1 and processed_slope[n] > 0:
                    n += 1
                processed_loc_peak.append(processed_energy[n - 1])
            else:
                n = i
                while n >= 0 and processed_slope[n] <= 0:
                    n -= 1
                processed_loc_peak.append(processed_energy[n + 1])

        # (6) Compuet the WSS Measure for this frame. This includes
        # determination of the weighting functino.
        # The weights are calculated by averaging individual
        # weighting factors from the clean and processed frame.
        # These weights W_clean and W_processed should range
        # from 0 to 1 and place more emphasis on spectral
        # peaks and less emphasis on slope differences in spectral
        # valleys. This procedure is described on page 1280 of
        # Klatt's 1982 ICASSP paper.
        dBMax_clean = max(clean_energy)
        dBMax_processed = max(processed_energy)
        clean_loc_peak = np.array(clean_loc_peak)
        processed_loc_peak = np.array(processed_loc_peak)
        Wmax_clean = Kmax / (Kmax + dBMax_clean - clean_energy[:num_crit-1])
        Wlocmax_clean = Klocmax / (Klocmax + clean_loc_peak - \
                                   clean_energy[:num_crit-1])
        W_clean = Wmax_clean * Wlocmax_clean
        Wmax_processed = Kmax / (Kmax + dBMax_processed - \
                                 processed_energy[:num_crit-1])
        Wlocmax_processed = Klocmax / (Klocmax + processed_loc_peak - \
                                       processed_energy[:num_crit-1])
        W_processed = Wmax_processed * Wlocmax_processed
        W = (W_clean + W_processed) / 2
        distortion.append(np.sum(W * (clean_slope[:num_crit - 1] - \
                                      processed_slope[:num_crit - 1]) ** 2))

        # this normalization is not part of Klatt's paper, but helps
        # to normalize the meaasure. Here we scale the measure by the sum of the
        # weights
        distortion[frame_count] = distortion[frame_count] / np.sum(W)
        start += int(skiprate)
    return distortion


def llr(ref_wav, deg_wav, srate):
    """Log-Likelihood Ratio; returns one value per frame (NaNs -> 0)."""
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]
    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.)  # 240 wlen in samples
    skiprate = np.floor(winlength / 4)
    if srate < 10000:
        # LPC analysis order
        P = 10
    else:
        P = 16

    # For each frame of input speech, calculate the Log Likelihood Ratio
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    distortion = []

    for frame_count in range(num_frames):
        # (1) Get the Frames for the test and reference speeech.
        # Multiply by Hanning window.
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Get the autocorrelation logs and LPC params used
        # to compute the LLR measure
        R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
        R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
        A_clean = A_clean[None, :]
        A_processed = A_processed[None, :]

        # (3) Compute the LLR measure: quadratic forms against the clean
        # autocorrelation (Toeplitz) matrix
        numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
        denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)

        if (numerator/denominator) <= 0:
            print(f'Numerator: {numerator}')
            print(f'Denominator: {denominator}')

        log_ = np.log(numerator / denominator)
        distortion.append(np.squeeze(log_))
        start += int(skiprate)
    return np.nan_to_num(np.array(distortion))
# -------------------------------------------------------------------------- #

#!/usr/bin/env python3

# Copyright 2020 Wen-Chin Huang and Tomoki Hayashi
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
# ported from https://github.com/espnet/espnet/blob/master/utils/mcd_calculate.py

"""Evaluate MCD between generated and groundtruth audios with SPTK-based mcep."""

from typing import Tuple

import numpy as np
import pysptk
from fastdtw import fastdtw
from scipy import spatial


def sptk_extract(
    x: np.ndarray,
    fs: int,
    n_fft: int = 512,
    n_shift: int = 256,
    mcep_dim: int = 25,
    mcep_alpha: float = 0.41,
    is_padding: bool = False,
) -> np.ndarray:
    """Extract SPTK-based mel-cepstrum.

    Args:
        x (ndarray): 1D waveform array.
        fs (int): Sampling rate
        n_fft (int): FFT length in point (default=512).
        n_shift (int): Shift length in point (default=256).
        mcep_dim (int): Dimension of mel-cepstrum (default=25).
        mcep_alpha (float): All pass filter coefficient (default=0.41).
        is_padding (bool): Whether to pad the end of signal (default=False).

    Returns:
        ndarray: Mel-cepstrum with the size (n_frame, mcep_dim + 1).

    """
    # perform padding so the last partial frame is covered
    if is_padding:
        n_pad = n_fft - (len(x) - n_fft) % n_shift
        x = np.pad(x, (0, n_pad), "reflect")

    # get number of frames
    n_frame = (len(x) - n_fft) // n_shift + 1

    # get window function
    win = pysptk.sptk.hamming(n_fft)

    # fall back to rate-dependent defaults when either setting is missing
    if mcep_dim is None or mcep_alpha is None:
        mcep_dim, mcep_alpha = _get_best_mcep_params(fs)

    # calculate mel-cepstrum per windowed frame
    mcep = [
        pysptk.mcep(
            x[n_shift * i : n_shift * i + n_fft] * win,
            mcep_dim,
            mcep_alpha,
            eps=1e-6,
            etype=1,
        )
        for i in range(n_frame)
    ]

    return np.stack(mcep)


def _get_best_mcep_params(fs: int) -> Tuple[int, float]:
    # Recommended (order, alpha) pairs per sampling rate, see
    # https://sp-nitech.github.io/sptk/latest/main/mgcep.html#_CPPv4N4sptk19MelCepstralAnalysisE
    if fs == 8000:
        return 13, 0.31
    elif fs == 16000:
        return 23, 0.42
    elif fs == 22050:
        return 34, 0.45
    elif fs == 24000:
        return 34, 0.46
    elif fs == 32000:
        return 36, 0.50
    elif fs == 44100:
        return 39, 0.53
    elif fs == 48000:
        return 39, 0.55
    else:
        raise ValueError(f"Not found the setting for {fs}.")


def calculate_mcd(
    inf_audio,
    ref_audio,
    fs,
    n_fft=1024,
    n_shift=256,
    mcep_dim=None,
    mcep_alpha=None,
):
    """Calculate MCD between an inferred and a reference waveform.

    Frames are aligned with DTW before averaging the per-frame distance.
    """

    # extract ground truth and converted features
    gen_mcep = sptk_extract(
        x=inf_audio,
        fs=fs,
        n_fft=n_fft,
        n_shift=n_shift,
        mcep_dim=mcep_dim,
        mcep_alpha=mcep_alpha,
    )
    gt_mcep = sptk_extract(
        x=ref_audio,
        fs=fs,
        n_fft=n_fft,
        n_shift=n_shift,
        mcep_dim=mcep_dim,
        mcep_alpha=mcep_alpha,
    )

    # DTW alignment between the two cepstral sequences
    _, path = fastdtw(gen_mcep, gt_mcep, dist=spatial.distance.euclidean)
    twf = np.array(path).T
    gen_mcep_dtw = gen_mcep[twf[0]]
    gt_mcep_dtw = gt_mcep[twf[1]]

    # MCD: mean over aligned frames of the dB-scaled Euclidean distance
    diff2sum = np.sum((gen_mcep_dtw - gt_mcep_dtw) ** 2, 1)
    mcd = np.mean(10.0 / np.log(10.0) * np.sqrt(2 * diff2sum), 0)

    return mcd
# --- new file in this diff: scores/llr.py ---
from basis import ScoreBasis
import numpy as np
from scipy.linalg import toeplitz
from scores.helper import lpcoeff


class LLR(ScoreBasis):
    """Log-Likelihood Ratio: mean spectral distance over 30 ms frames."""

    def __init__(self):
        super(LLR, self).__init__(name='LLR')
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        if len(audios) != 2:
            raise ValueError('LLR needs a reference and a test signals.')
        return cal_LLR(audios[0], audios[1], score_rate)


def cal_LLR(ref_wav, deg_wav, srate):
    """Mean Log-Likelihood Ratio between reference and degraded signals.

    obtained from https://github.com/wooseok-shin/MetricGAN-plus-pytorch/blob/main/metric_functions/metric_helper.py
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]
    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.)  # 240 wlen in samples (8 kHz)
    skiprate = np.floor(winlength / 4)
    # LPC analysis order depends on the bandwidth
    if srate < 10000:
        P = 10
    else:
        P = 16

    # For each frame of input speech, calculate the Log Likelihood Ratio
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    distortion = []

    for frame_count in range(num_frames):
        # (1) window the current reference/processed frames
        clean_frame = clean_speech[start:start + winlength] * window
        processed_frame = processed_speech[start:start + winlength] * window

        # (2) autocorrelation lags and LPC params of both frames
        R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
        R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
        A_clean = A_clean[None, :]
        A_processed = A_processed[None, :]

        # (3) LLR measure for this frame
        numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
        denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)

        if (numerator / denominator) <= 0:
            print(f'Numerator: {numerator}')
            print(f'Denominator: {denominator}')

        log_ = np.log(numerator / denominator)
        distortion.append(np.squeeze(log_))
        start += int(skiprate)
    return np.mean(np.nan_to_num(np.array(distortion)))


# --- new file in this diff: scores/lsd.py ---
import librosa

EPS = 1e-12


class LSD(ScoreBasis):
    """Log-Spectral Distance between reference and estimated spectrograms."""

    def __init__(self):
        super(LSD, self).__init__(name='LSD')
        self.intrusive = False
        self.mono = True

    def windowed_scoring(self, audios, score_rate):
        if len(audios) != 2:
            # fixed: the message previously said 'NB_PESQ' (copy-paste slip)
            raise ValueError('LSD needs a reference and a test signals.')
        est = wav_to_spectrogram(audios[1], score_rate)
        target = wav_to_spectrogram(audios[0], score_rate)
        return cal_LSD(est, target)


def wav_to_spectrogram(wav, rate):
    """Magnitude STFT, 10 ms hop; n_fft scaled so 48 kHz maps to 2048."""
    hop_length = int(rate / 100)
    n_fft = int(2048 / (48000 / rate))
    spec = np.abs(librosa.stft(wav, hop_length=hop_length, n_fft=n_fft))
    # (time, freq) layout
    spec = np.transpose(spec, (1, 0))
    return spec


def cal_LSD(est, target):
    """Mean over time of the RMS (over frequency) log10 power ratio."""
    log_ratio = np.log10(target ** 2 / ((est + EPS) ** 2) + EPS) ** 2
    # fixed: `return` and `lsd_` were split across lines in the diff
    lsd_ = np.mean(np.mean(log_ratio, axis=1) ** 0.5, axis=0)
    return lsd_


# --- new file in this diff: scores/mcd.py ---
import math
import pyworld
import pysptk
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
# refer to: https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py


class MCD(ScoreBasis):
    """Mel-Cepstral Distortion, delegating to Calculate_MCD below."""

    def __init__(self):
        super(MCD, self).__init__(name='MCD')
        self.intrusive = False
        # three different modes: "plain", "dtw" and "dtw_sl"
        self.mcd_toolbox = Calculate_MCD(MCD_mode="plain")

    def windowed_scoring(self, audios, score_rate):
        if len(audios) != 2:
            raise ValueError('MCD needs a reference and a test signals.')
        return self.mcd_toolbox.calculate_mcd(audios[1], audios[0], score_rate)


# ================================================= #
# calculate the Mel-Cepstral Distortion (MCD) value #
# ================================================= #
class Calculate_MCD(object):
    """MCD calculator ported from pymcd (see link above)."""

    def __init__(self, MCD_mode):
        super(Calculate_MCD, self).__init__()
        self.MCD_mode = MCD_mode
        self.FRAME_PERIOD = 5.0
        # 10 / ln(10) * sqrt(2) ~= 6.141851463713754
        self.log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)

    def load_wav(self, wav_file, sample_rate):
        """Load a wav file with librosa (mono, resampled to sample_rate)."""
        wav, _ = librosa.load(wav_file, sr=sample_rate, mono=True)
        return wav

    def log_spec_dB_dist(self, x, y):
        """Log-spectral distance (dB) between two cepstral vectors."""
        diff = x - y
        return self.log_spec_dB_const * math.sqrt(np.inner(diff, diff))

    def calculate_mcd_distance(self, x, y, path):
        """Total Euclidean cost along `path` (pairs of frame indices)."""
        pathx = list(map(lambda l: l[0], path))
        pathy = list(map(lambda l: l[1], path))
        x, y = x[pathx], y[pathy]
        frames_tot = x.shape[0]  # number of aligned frame pairs

        z = x - y
        min_cost_tot = np.sqrt((z * z).sum(-1)).sum()

        return frames_tot, min_cost_tot

    def wav2mcep_numpy(self, loaded_wav, score_rate=22050, alpha=0.65, fft_size=512):
        """WORLD spectral envelope -> MCEP features (order 13).

        alpha=0.65 is the value commonly used at 22050 Hz.
        """
        _, sp, _ = pyworld.wav2world(loaded_wav.astype(np.double), fs=score_rate,
                                     frame_period=self.FRAME_PERIOD, fft_size=fft_size)
        mcep = pysptk.sptk.mcep(sp, order=13, alpha=alpha, maxiter=0,
                                etype=1, eps=1.0E-8, min_det=0.0, itype=3)
        return mcep

    def average_mcd(self, loaded_ref_wav, loaded_syn_wav, cost_function, MCD_mode, score_rate):
        """Average MCD between two already-loaded waveforms.

        NOTE(review): this method was garbled in the diff; reconstructed from
        the pymcd reference implementation — verify against upstream.
        """
        if MCD_mode == "plain":
            # pad the shorter waveform with zeros so both have equal length
            if len(loaded_ref_wav) < len(loaded_syn_wav):
                loaded_ref_wav = np.pad(
                    loaded_ref_wav, (0, len(loaded_syn_wav) - len(loaded_ref_wav)))
            else:
                loaded_syn_wav = np.pad(
                    loaded_syn_wav, (0, len(loaded_ref_wav) - len(loaded_syn_wav)))

        ref_mcep_vec = self.wav2mcep_numpy(loaded_ref_wav, score_rate)
        syn_mcep_vec = self.wav2mcep_numpy(loaded_syn_wav, score_rate)

        cof = 1.0
        if MCD_mode == "plain":
            # frame-by-frame alignment (signals were padded to equal length)
            path = list(zip(range(len(ref_mcep_vec)), range(len(syn_mcep_vec))))
        else:
            # "dtw_sl" additionally scales by the sequence-length ratio
            cof = len(ref_mcep_vec) / len(syn_mcep_vec) \
                if len(ref_mcep_vec) > len(syn_mcep_vec) \
                else len(syn_mcep_vec) / len(ref_mcep_vec)
            # exclude the 0th (energy) coefficient from the DTW alignment
            _, path = fastdtw(ref_mcep_vec[:, 1:], syn_mcep_vec[:, 1:], dist=euclidean)

        frames_tot, min_cost_tot = self.calculate_mcd_distance(
            ref_mcep_vec, syn_mcep_vec, path)

        if MCD_mode == "dtw_sl":
            mean_mcd = cof * self.log_spec_dB_const * min_cost_tot / frames_tot
        else:
            mean_mcd = self.log_spec_dB_const * min_cost_tot / frames_tot

        return mean_mcd

    def calculate_mcd(self, reference_audio, synthesized_audio, score_rate):
        """Public entry point: MCD between two in-memory waveforms."""
        mean_mcd = self.average_mcd(reference_audio, synthesized_audio,
                                    self.log_spec_dB_dist, self.MCD_mode,
                                    score_rate)
        return mean_mcd


# --- new file in this diff: scores/mosnet/__init__.py ---
def load(window, hop=None):
    """Build a MOSNet metric; configures TF GPU memory growth first."""
    import tensorflow as tf
    from .model import MOSNet
    tf.debugging.set_log_device_placement(False)
    # set memory growth
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)

            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,",
                  len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # Memory growth must be set before GPUs have been initialized
            print(e)

    mosnet = MOSNet(window, hop)
    return mosnet


# --- binary entries in this diff (content not representable as text): ---
#   scores/mosnet/__pycache__/__init__.cpython-38.pyc
#   scores/mosnet/cnn_blstm.h5 (git-lfs pointer,
#     sha256:78b75e7d76ee6074ea7d57dcffa56d0c90be9d3d8dedc2217e25e259423cb756,
#     size 14248464)

# --- new file in this diff: scores/mosnet/model.py ---
from tensorflow import keras
from tensorflow.keras import Model, layers
from tensorflow.keras.layers import Dense, Dropout, Conv2D
from tensorflow.keras.layers import LSTM, TimeDistributed, Bidirectional
from tensorflow.keras.constraints import max_norm
import librosa
import scipy
import numpy as np
import os
from ... import Metric
import Metric + +# prevent TF warnings +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' + + +class MOSNet(Metric): + def __init__(self, window, hop=None): + super(MOSNet, self).__init__(name='MOSNet', window=window, hop=hop) + + # constants + self.fixed_rate = 16000 + self.mono = True + self.absolute = True + + self.FFT_SIZE = 512 + self.SGRAM_DIM = self.FFT_SIZE // 2 + 1 + self.HOP_LENGTH = 256 + self.WIN_LENGTH = 512 + + _input = keras.Input(shape=(None, 257)) + + re_input = layers.Reshape((-1, 257, 1), input_shape=(-1, 257))(_input) + + # CNN + conv1 = (Conv2D(16, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(re_input) + conv1 = (Conv2D(16, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(conv1) + conv1 = (Conv2D(16, (3, 3), strides=(1, 3), activation='relu', + padding='same'))(conv1) + + conv2 = (Conv2D(32, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(conv1) + conv2 = (Conv2D(32, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(conv2) + conv2 = (Conv2D(32, (3, 3), strides=(1, 3), activation='relu', + padding='same'))(conv2) + + conv3 = (Conv2D(64, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(conv2) + conv3 = (Conv2D(64, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(conv3) + conv3 = (Conv2D(64, (3, 3), strides=(1, 3), activation='relu', + padding='same'))(conv3) + + conv4 = (Conv2D(128, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(conv3) + conv4 = (Conv2D(128, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(conv4) + conv4 = (Conv2D(128, (3, 3), strides=(1, 3), activation='relu', + padding='same'))(conv4) + + re_shape = layers.Reshape((-1, 4*128), input_shape=(-1, 4, 128))(conv4) + + # BLSTM + blstm1 = Bidirectional( + LSTM(128, return_sequences=True, dropout=0.3, + recurrent_dropout=0.3, + recurrent_constraint=max_norm(0.00001)), + merge_mode='concat')(re_shape) + + # DNN + flatten = TimeDistributed(layers.Flatten())(blstm1) + dense1 = 
TimeDistributed(Dense(128, activation='relu'))(flatten) + dense1 = Dropout(0.3)(dense1) + + frame_score = TimeDistributed(Dense(1), name='frame')(dense1) + import warnings + + average_score = layers.GlobalAveragePooling1D(name='avg')(frame_score) + + self.model = Model(outputs=[average_score, frame_score], inputs=_input) + + # weights are in the directory of this file + pre_trained_dir = os.path.dirname(__file__) + + # load pre-trained weights. CNN_BLSTM is reported as best + self.model.load_weights(os.path.join(pre_trained_dir, 'cnn_blstm.h5')) + + def test_window(self, audios, rate): + # stft. D: (1+n_fft//2, T) + linear = librosa.stft(y=np.asfortranarray(audios[0]), + n_fft=self.FFT_SIZE, + hop_length=self.HOP_LENGTH, + win_length=self.WIN_LENGTH, + window=scipy.signal.hamming, + ) + + # magnitude spectrogram + mag = np.abs(linear) # (1+n_fft/2, T) + + # shape in (T, 1+n_fft/2) + mag = np.transpose(mag.astype(np.float32)) + + # now call the actual MOSnet + return {'mosnet': + self.model.predict(mag[None, ...], verbose=0, batch_size=1)[0]} diff --git a/scores/nb_pesq.py b/scores/nb_pesq.py new file mode 100644 index 0000000000000000000000000000000000000000..02bd009db1adbcb324fcfe087376ec0520743493 --- /dev/null +++ b/scores/nb_pesq.py @@ -0,0 +1,15 @@ +from basis import ScoreBasis + + +class NB_PESQ(ScoreBasis): + def __init__(self): + super(NB_PESQ, self).__init__(name='NB_PESQ') + self.intrusive = False + self.score_rate = 16000 + + def windowed_scoring(self, audios, score_rate): + from pypesq import pesq + if len(audios) != 2: + raise ValueError('NB_PESQ needs a reference and a test signals.') + return pesq(audios[1], audios[0], score_rate) + diff --git a/scores/pesq.py b/scores/pesq.py new file mode 100644 index 0000000000000000000000000000000000000000..ff016786fad0e5bcc98a0267c9863dd646c1d456 --- /dev/null +++ b/scores/pesq.py @@ -0,0 +1,15 @@ +from basis import ScoreBasis + +class PESQ(ScoreBasis): + def __init__(self): + super(PESQ, 
self).__init__(name='PESQ') + self.intrusive = False + self.mono = True + self.fixed_rate = 16000 + + def windowed_scoring(self, audios, rate): + from pesq import pesq + if len(audios) != 2: + raise ValueError('PESQ needs a reference and a test signals.') + return pesq(rate, audios[1], audios[0], 'wb') + diff --git a/scores/sisdr.py b/scores/sisdr.py new file mode 100644 index 0000000000000000000000000000000000000000..142b89f21f2ad35678fa38f66c99d01790bfc7e2 --- /dev/null +++ b/scores/sisdr.py @@ -0,0 +1,32 @@ +from basis import ScoreBasis +import numpy as np +from numpy.linalg import norm + +class SISDR(ScoreBasis): + def __init__(self): + super(SISDR, self).__init__(name='SISDR') + self.intrusive = False + + def windowed_scoring(self, audios, score_rate): + # as provided by @Jonathan-LeRoux and slightly adapted for the case of just one reference + # and one estimate. + # see original code here: https://github.com/sigsep/bsseval/issues/3#issuecomment-494995846 + if len(audios) != 2: + raise ValueError('PESQ needs a reference and a test signals.') + eps = np.finfo(audios[0].dtype).eps + reference = audios[1].reshape(audios[1].size, 1) + estimate = audios[0].reshape(audios[0].size, 1) + + Rss = np.dot(reference.T, reference) + + # get the scaling factor for clean sources + a = (eps + np.dot(reference.T, estimate)) / (Rss + eps) + + e_true = a * reference + e_res = estimate - e_true + + Sss = (e_true**2).sum() + Snn = (e_res**2).sum() + + return 10 * np.log10((eps+ Sss)/(eps + Snn)) + diff --git a/scores/snr.py b/scores/snr.py new file mode 100644 index 0000000000000000000000000000000000000000..e0818e04db37d0aa3b91be9561abaa464839f744 --- /dev/null +++ b/scores/snr.py @@ -0,0 +1,33 @@ +from basis import ScoreBasis +import numpy as np + +class SNR(ScoreBasis): + def __init__(self): + super(SNR, self).__init__(name='SNR') + self.intrusive = False + + def windowed_scoring(self, audios, score_rate): + if len(audios) != 2: + raise ValueError('SNR needs a reference and a 
test signals.') + return cal_SNR(audios[0], audios[1], score_rate) + +def cal_SNR(ref_wav, deg_wav, srate=16000, eps=1e-10): + # obtained from https://github.com/wooseok-shin/MetricGAN-plus-pytorch/blob/main/metric_functions/metric_helper.py + """ Segmental Signal-to-Noise Ratio Objective Speech Quality Measure + This function implements the segmental signal-to-noise ratio + as defined in [1, p. 45] (see Equation 2.12). + """ + clean_speech = ref_wav + processed_speech = deg_wav + clean_length = ref_wav.shape[0] + processed_length = deg_wav.shape[0] + + # scale both to have same dynamic range. Remove DC too. + clean_speech -= clean_speech.mean() + processed_speech -= processed_speech.mean() + processed_speech *= (np.max(np.abs(clean_speech)) / np.max(np.abs(processed_speech))) + + # Signal-to-Noise Ratio + dif = ref_wav - deg_wav + overall_snr = 10 * np.log10(np.sum(ref_wav ** 2) / (np.sum(dif ** 2) + 10e-20)) + return overall_snr diff --git a/scores/srmr/LICENSE.md b/scores/srmr/LICENSE.md new file mode 100644 index 0000000000000000000000000000000000000000..9bbbb29ce6e21137146cc67aab4b550782dec099 --- /dev/null +++ b/scores/srmr/LICENSE.md @@ -0,0 +1,22 @@ +The SRMRpy toolbox is licensed under the MIT license. + +> Copyright (c) 2014 João F. Santos, Tiago H. Falk +> +> Permission is hereby granted, free of charge, to any person obtaining a copy +> of this software and associated documentation files (the "Software"), to deal +> in the Software without restriction, including without limitation the rights +> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +> copies of the Software, and to permit persons to whom the Software is +> furnished to do so, subject to the following conditions: +> +> The above copyright notice and this permission notice shall be included in all +> copies or substantial portions of the Software. 
+> +> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +> SOFTWARE. + diff --git a/scores/srmr/__pycache__/__init__.cpython-38.pyc b/scores/srmr/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2b8d048cf938935d45f0ac2772d5195f526f440 Binary files /dev/null and b/scores/srmr/__pycache__/__init__.cpython-38.pyc differ diff --git a/scores/srmr/__pycache__/cal_srmr.cpython-38.pyc b/scores/srmr/__pycache__/cal_srmr.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e23a9948a5ac3ffcc774276bee441beafb0e235e Binary files /dev/null and b/scores/srmr/__pycache__/cal_srmr.cpython-38.pyc differ diff --git a/scores/srmr/__pycache__/hilbert.cpython-38.pyc b/scores/srmr/__pycache__/hilbert.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd002a203a4a92650082bca1b50345d80ff6fe51 Binary files /dev/null and b/scores/srmr/__pycache__/hilbert.cpython-38.pyc differ diff --git a/scores/srmr/__pycache__/metric_srmr.cpython-38.pyc b/scores/srmr/__pycache__/metric_srmr.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b37e45deb829a7fbda422f7ab3172f39b6f86fab Binary files /dev/null and b/scores/srmr/__pycache__/metric_srmr.cpython-38.pyc differ diff --git a/scores/srmr/__pycache__/modulation_filters.cpython-38.pyc b/scores/srmr/__pycache__/modulation_filters.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0dda1afb314dbd08b200ec9d63853f0ae233b1f5 Binary files /dev/null and 
b/scores/srmr/__pycache__/modulation_filters.cpython-38.pyc differ diff --git a/scores/srmr/__pycache__/srmr.cpython-38.pyc b/scores/srmr/__pycache__/srmr.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5069cc0060f0ed39c1969bb993ea53445fee8116 Binary files /dev/null and b/scores/srmr/__pycache__/srmr.cpython-38.pyc differ diff --git a/scores/srmr/cal_srmr.py b/scores/srmr/cal_srmr.py new file mode 100644 index 0000000000000000000000000000000000000000..38c006502d814eca69617fa193d7f39c63915b1f --- /dev/null +++ b/scores/srmr/cal_srmr.py @@ -0,0 +1,165 @@ +# -*- coding: utf-8 -*- +# Copyright 2014 João Felipe Santos, jfsantos@emt.inrs.ca +# +# This file is part of the SRMRpy library, and is licensed under the +# MIT license: https://github.com/jfsantos/SRMRpy/blob/master/LICENSE + +from __future__ import division +import numpy as np +from scipy.signal import hamming +from .hilbert import hilbert +from .modulation_filters import compute_modulation_cfs, modulation_filterbank,\ + modfilt +from gammatone.fftweight import fft_gtgram +from gammatone.filters import centre_freqs, make_erb_filters, erb_filterbank +from srmrpy.segmentaxis import segment_axis + +from scipy.io.wavfile import read as readwav + + +def calc_erbs(low_freq, fs, n_filters): + ear_q = 9.26449 # Glasberg and Moore Parameters + min_bw = 24.7 + order = 1 + + erbs = ((centre_freqs(fs, n_filters, low_freq)/ear_q)**order + + min_bw**order)**(1/order) + return erbs + + +def calc_cutoffs(cfs, fs, q): + # Calculates cutoff frequencies (3 dB) for 2nd order bandpass + w0 = 2*np.pi*cfs/fs + B0 = np.tan(w0/2)/q + L = cfs - (B0 * fs / (2*np.pi)) + R = cfs + (B0 * fs / (2*np.pi)) + return L, R + + +def normalize_energy(energy, drange=30.0): + peak_energy = np.max(np.mean(energy, axis=0)) + min_energy = peak_energy*10.0**(-drange/10.0) + energy[energy < min_energy] = min_energy + energy[energy > peak_energy] = peak_energy + return energy + + +def cal_SRMR(x, fs, 
n_cochlear_filters=23, low_freq=125, min_cf=4, max_cf=128, + fast=True, norm=False): + wLengthS = .256 + wIncS = .064 + # Computing gammatone envelopes + if fast: + mfs = 400.0 + gt_env = fft_gtgram(x, fs, 0.010, 0.0025, n_cochlear_filters, low_freq) + else: + cfs = centre_freqs(fs, n_cochlear_filters, low_freq) + fcoefs = make_erb_filters(fs, cfs) + gt_env = np.abs(hilbert(erb_filterbank(x, fcoefs))) + mfs = fs + + wLength = int(np.ceil(wLengthS*mfs)) + wInc = int(np.ceil(wIncS*mfs)) + + # Computing modulation filterbank with Q = 2 and 8 channels + mod_filter_cfs = compute_modulation_cfs(min_cf, max_cf, 8) + MF = modulation_filterbank(mod_filter_cfs, mfs, 2) + + n_frames = int(1 + (gt_env.shape[1] - wLength)//wInc) + w = hamming(wLength+1)[:-1] # window is periodic, not symmetric + + energy = np.zeros((n_cochlear_filters, 8, n_frames)) + for i, ac_ch in enumerate(gt_env): + mod_out = modfilt(MF, ac_ch) + for j, mod_ch in enumerate(mod_out): + mod_out_frame = segment_axis(mod_ch, wLength, + overlap=wLength-wInc, + end='pad') + energy[i, j, :] = np.sum((w*mod_out_frame[:n_frames])**2, axis=1) + + if norm: + energy = normalize_energy(energy) + + erbs = np.flipud(calc_erbs(low_freq, fs, n_cochlear_filters)) + + avg_energy = np.mean(energy, axis=2) + total_energy = np.sum(avg_energy) + + AC_energy = np.sum(avg_energy, axis=1) + AC_perc = AC_energy*100/total_energy + + AC_perc_cumsum = np.cumsum(np.flipud(AC_perc)) + K90perc_idx = np.where(AC_perc_cumsum > 90)[0][0] + + BW = erbs[K90perc_idx] + + cutoffs = calc_cutoffs(mod_filter_cfs, fs, 2)[0] + + if (BW > cutoffs[4]) and (BW < cutoffs[5]): + Kstar = 5 + elif (BW > cutoffs[5]) and (BW < cutoffs[6]): + Kstar = 6 + elif (BW > cutoffs[6]) and (BW < cutoffs[7]): + Kstar = 7 + elif (BW > cutoffs[7]): + Kstar = 8 + + return np.sum(avg_energy[:, :4])/np.sum(avg_energy[:, 4:Kstar]), energy + + +def process_file(f, args): + fs, s = readwav(f) + if len(s.shape) > 1: + s = s[:, 0] + if np.issubdtype(s.dtype, np.int): + s = 
s.astype('float')/np.iinfo(s.dtype).max + r, energy = srmr( + s, fs, n_cochlear_filters=args.n_cochlear_filters, + min_cf=args.min_cf, + max_cf=args.max_cf, + fast=args.fast, + norm=args.norm) + return f, r + + +def main(): + import argparse + import multiprocessing + import functools + + parser = argparse.ArgumentParser( + description='Compute the SRMR metric for a given WAV file') + parser.add_argument( + '-f', '--fast', dest='fast', action='store_true', default=False, + help='Use the faster version based on the gammatonegram') + parser.add_argument( + '-n', '--norm', dest='norm', action='store_true', default=False, + help='Use modulation spectrum energy normalization') + parser.add_argument( + '--ncochlearfilters', dest='n_cochlear_filters', type=int, default=23, + help='Number of filters in the acoustic filterbank') + parser.add_argument( + '--mincf', dest='min_cf', type=float, default=4.0, + help='Center frequency of the first modulation filter') + parser.add_argument( + '--maxcf', dest='max_cf', type=float, default=128.0, + help='Center frequency of the last modulation filter') + parser.add_argument( + 'path', metavar='path', nargs='+', + help='Path of the file or files to be processed.' 
+ ' Can also be a folder.') + args = parser.parse_args() + + if len(args.path) > 1: + p = multiprocessing.Pool(multiprocessing.cpu_count()) + results = dict(p.map(functools.partial(process_file, args=args), + args.path)) + for f in args.path: + print('{}: {}'.format(f, results[f])) + else: + f, r = process_file(args.path[0], args) + print('{}: {}'.format(f, r)) + + +if __name__ == '__main__': + main() diff --git a/scores/srmr/hilbert.py b/scores/srmr/hilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..3f0ba1c3f2d486e13367ab81320ba71d8e874392 --- /dev/null +++ b/scores/srmr/hilbert.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- +# Copyright 2014 João Felipe Santos, jfsantos@emt.inrs.ca +# +# This file is part of the SRMRpy library, and is licensed under the +# MIT license: https://github.com/jfsantos/SRMRpy/blob/master/LICENSE + +import numpy as np +from numpy.fft import fft, ifft + +# This is copied straight from scipy.signal. The reason is that scipy.signal's version +# will always use the fft and ifft functions from fftpack. If you have Anaconda with an MKL +# license, you can install the package mklfft, which will plug the faster MKL FFT functions +# into numpy. + +def hilbert(x, N=None, axis=-1): + """ + Compute the analytic signal, using the Hilbert transform. + The transformation is done along the last axis by default. + Parameters + ---------- + x : array_like + Signal data. Must be real. + N : int, optional + Number of Fourier components. Default: ``x.shape[axis]`` + axis : int, optional + Axis along which to do the transformation. Default: -1. + Returns + ------- + xa : ndarray + Analytic signal of `x`, of each 1-D array along `axis` + Notes + ----- + The analytic signal ``x_a(t)`` of signal ``x(t)`` is: + .. math:: x_a = F^{-1}(F(x) 2U) = x + i y + where `F` is the Fourier transform, `U` the unit step function, + and `y` the Hilbert transform of `x`. 
[1]_ + In other words, the negative half of the frequency spectrum is zeroed + out, turning the real-valued signal into a complex signal. The Hilbert + transformed signal can be obtained from ``np.imag(hilbert(x))``, and the + original signal from ``np.real(hilbert(x))``. + References + ---------- + .. [1] Wikipedia, "Analytic signal". + http://en.wikipedia.org/wiki/Analytic_signal + """ + x = np.asarray(x) + if np.iscomplexobj(x): + raise ValueError("x must be real.") + if N is None: + N = x.shape[axis] + # Make N multiple of 16 to make sure the transform will be fast + if N % 16: + N = int(np.ceil(N/16)*16) + if N <= 0: + raise ValueError("N must be positive.") + + Xf = fft(x, N, axis=axis) + h = np.zeros(N) + if N % 2 == 0: + h[0] = h[N // 2] = 1 + h[1:N // 2] = 2 + else: + h[0] = 1 + h[1:(N + 1) // 2] = 2 + + if len(x.shape) > 1: + ind = [np.newaxis] * x.ndim + ind[axis] = slice(None) + h = h[ind] + y = ifft(Xf * h, axis=axis) + return y[:x.shape[axis]] diff --git a/scores/srmr/modulation_filters.py b/scores/srmr/modulation_filters.py new file mode 100644 index 0000000000000000000000000000000000000000..1317958e790ae0c300b74d8932f64f3b1d6c0397 --- /dev/null +++ b/scores/srmr/modulation_filters.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +# Copyright 2014 João Felipe Santos, jfsantos@emt.inrs.ca +# +# This file is part of the SRMRpy library, and is licensed under the +# MIT license: https://github.com/jfsantos/SRMRpy/blob/master/LICENSE + +from __future__ import division +import numpy as np +import scipy.signal as sig + +def make_modulation_filter(w0, Q): + W0 = np.tan(w0/2) + B0 = W0/Q + b = np.array([B0, 0, -B0], dtype=np.float) + a = np.array([(1 + B0 + W0**2), (2*W0**2 - 2), (1 - B0 + W0**2)], dtype=np.float) + return b, a + +def modulation_filterbank(mf, fs, Q): + return [make_modulation_filter(w0, Q) for w0 in 2*np.pi*mf/fs] + +def compute_modulation_cfs(min_cf, max_cf, n): + spacing_factor = (max_cf/min_cf)**(1.0/(n-1)) + cfs = np.zeros(n) + cfs[0] = 
min_cf + for k in range(1,n): + cfs[k] = cfs[k-1]*spacing_factor + return cfs + +def modfilt(F, x): + y = np.zeros((len(F), len(x)), dtype=np.float) + for k, f in enumerate(F): + y[k] = sig.lfilter(f[0], f[1], x) + return y + diff --git a/scores/srmr/segmentaxis.py b/scores/srmr/segmentaxis.py new file mode 100644 index 0000000000000000000000000000000000000000..4ad5f1c31811e2a64d871091473729bb68702a77 --- /dev/null +++ b/scores/srmr/segmentaxis.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- +# Copyright 2014 João Felipe Santos, jfsantos@emt.inrs.ca +# +# This file is part of the SRMRpy library, and is licensed under the +# MIT license: https://github.com/jfsantos/SRMRpy/blob/master/LICENSE + +"""segmentaxis code, originally in scikits.talkbox (https://pypi.python.org/pypi/scikits.talkbox) + +This code has been implemented by Anne Archibald, and has been discussed on the +ML.""" +from __future__ import division +import numpy as np +import warnings + +def segment_axis(a, length, overlap=0, axis=None, end='cut', endvalue=0): + """Generate a new array that chops the given array along the given axis + into overlapping frames. + + example: + >>> segment_axis(arange(10), 4, 2) + array([[0, 1, 2, 3], + [2, 3, 4, 5], + [4, 5, 6, 7], + [6, 7, 8, 9]]) + + arguments: + a The array to segment + length The length of each frame + overlap The number of array elements by which the frames should overlap + axis The axis to operate on; if None, act on the flattened array + end What to do with the last frame, if the array is not evenly + divisible into pieces. Options are: + + 'cut' Simply discard the extra values + 'wrap' Copy values from the beginning of the array + 'pad' Pad with a constant value + + endvalue The value to use for end='pad' + + The array is not copied unless necessary (either because it is unevenly + strided and being flattened or because end is set to 'pad' or 'wrap'). 
+ """ + + if axis is None: + a = np.ravel(a) # may copy + axis = 0 + + l = a.shape[axis] + + if overlap >= length: + raise ValueError("frames cannot overlap by more than 100%") + if overlap < 0 or length <= 0: + raise ValueError("overlap must be nonnegative and length must "\ + "be positive") + + if l < length or (l-length) % (length-overlap): + if l>length: + roundup = length + (1+(l-length)//(length-overlap))*(length-overlap) + rounddown = length + ((l-length)//(length-overlap))*(length-overlap) + else: + roundup = length + rounddown = 0 + assert rounddown < l < roundup + assert roundup == rounddown + (length-overlap) \ + or (roundup == length and rounddown == 0) + a = a.swapaxes(-1,axis) + + if end == 'cut': + a = a[..., :rounddown] + elif end in ['pad','wrap']: # copying will be necessary + s = list(a.shape) + s[-1] = roundup + b = np.empty(s,dtype=a.dtype) + if end in ['pad','wrap']: + b[..., :l] = a + if end == 'pad': + b[..., l:] = endvalue + elif end == 'wrap': + b[..., l:] = a[..., :roundup-l] + a = b + elif end == 'delay': + s = list(a.shape) + l_orig = l + l += overlap + # if l not divisible by length, pad last frame with zeros + if l_orig % (length-overlap): + roundup = length + (1+(l-length)//(length-overlap))*(length-overlap) + else: + roundup = l + s[-1] = roundup + b = np.empty(s,dtype=a.dtype) + + b[..., :(overlap)] = endvalue + b[..., (overlap):(l_orig+overlap)] = a + b[..., (l_orig+overlap):] = endvalue + a = b + else: + raise ValueError("end has to be either 'cut', 'pad', 'wrap', or 'delay'.") + + a = a.swapaxes(-1,axis) + + + l = a.shape[axis] + if l == 0: + raise ValueError("Not enough data points to segment array in 'cut' mode; "\ + "try 'pad' or 'wrap'") + assert l >= length + assert (l-length) % (length-overlap) == 0 + n = 1 + (l-length) // (length-overlap) + s = a.strides[axis] + newshape = a.shape[:axis] + (n,length) + a.shape[axis+1:] + newstrides = a.strides[:axis] + ((length-overlap)*s,s) + a.strides[axis+1:] + + try: + return 
np.ndarray.__new__(np.ndarray, strides=newstrides, + shape=newshape, buffer=a, dtype=a.dtype) + except TypeError: + warnings.warn("Problem with ndarray creation forces copy.") + a = a.copy() + # Shape doesn't change but strides does + newstrides = a.strides[:axis] + ((length-overlap)*s,s) \ + + a.strides[axis+1:] + return np.ndarray.__new__(np.ndarray, strides=newstrides, + shape=newshape, buffer=a, dtype=a.dtype) diff --git a/scores/srmr/srmr.py b/scores/srmr/srmr.py new file mode 100644 index 0000000000000000000000000000000000000000..053d1bd1d134f688b8d2ade4284dc29439fe2481 --- /dev/null +++ b/scores/srmr/srmr.py @@ -0,0 +1,13 @@ +from basis import ScoreBasis + + +class SRMR(ScoreBasis): + def __init__(self): + super(SRMR, self).__init__(name='SRMR') + self.intrusive = True + self.score_rate = 16000 + + def windowed_scoring(self, audios, score_rate): + from scores.srmr.cal_srmr import cal_SRMR + return cal_SRMR(audios[0], score_rate, n_cochlear_filters=23,low_freq=125, min_cf=4,max_cf=128, fast=True, norm=False)[0] + diff --git a/scores/srmr/vad.py b/scores/srmr/vad.py new file mode 100644 index 0000000000000000000000000000000000000000..aaf014f81e70f7c76a70fd0f77dbfd5bd0023237 --- /dev/null +++ b/scores/srmr/vad.py @@ -0,0 +1,38 @@ +import numpy as np +from srmrpy.segmentaxis import segment_axis + +def simple_energy_vad(x, fs, framelen=0.02, theta_main=30, theta_min=-55): + '''Simple energy voice activity detection algorithm based on energy + thresholds as described in Tomi Kinnunen and Padmanabhan Rajan, "A + practical, self-adaptive voice activity detector for speaker verification + with noisy telephone and microphone data", ICASSP 2013, Vancouver (NOTE: + this is the benchmark method, not the method proposed by the authors). 
+ ''' + # Split signal in frames + framelen = int(framelen * fs) + frames = segment_axis(x, length=framelen, overlap=0, end='pad') + frames_zero_mean = frames - frames.mean(axis=0) + frame_energy = 10*np.log10(1/(framelen-1) * (frames_zero_mean**2).sum(axis=1) + 1e-6) + max_energy = max(frame_energy) + speech_presence = (frame_energy > max_energy - theta_main) & (frame_energy > theta_min) + x_vad = np.zeros_like(x, dtype=bool) + for idx, frame in enumerate(frames): + if speech_presence[idx]: + x_vad[idx*framelen:(idx+1)*framelen] = True + else: + x_vad[idx*framelen:(idx+1)*framelen] = False + return x[x_vad], x_vad + +if __name__ == '__main__': + import sys + from scipy.io.wavfile import read as readwav + from matplotlib import pyplot as plt + + fs, s = readwav(sys.argv[1]) + s = s.astype('float')/np.iinfo(s.dtype).max + s_vad, speech_presence = simple_energy_vad(s, fs) + + plt.plot(s) + plt.plot(s_vad - 1, 'g') + plt.show() + diff --git a/scores/ssnr.py b/scores/ssnr.py new file mode 100644 index 0000000000000000000000000000000000000000..5418a7f7189e0f2c5aa5d970e69ab04abd4ca2bf --- /dev/null +++ b/scores/ssnr.py @@ -0,0 +1,58 @@ +from basis import ScoreBasis +import numpy as np + +class SSNR(ScoreBasis): + def __init__(self): + super(SSNR, self).__init__(name='SSNR') + self.intrusive = False + + def windowed_scoring(self, audios, score_rate): + if len(audios) != 2: + raise ValueError('SSNR needs a reference and a test signals.') + return cal_SSNR(audios[0], audios[1], score_rate) + +def cal_SSNR(ref_wav, deg_wav, srate=16000, eps=1e-10): + # obtained from https://github.com/wooseok-shin/MetricGAN-plus-pytorch/blob/main/metric_functions/metric_helper.py + """ Segmental Signal-to-Noise Ratio Objective Speech Quality Measure + This function implements the segmental signal-to-noise ratio + as defined in [1, p. 45] (see Equation 2.12). 
+ """ + clean_speech = ref_wav + processed_speech = deg_wav + clean_length = ref_wav.shape[0] + processed_length = deg_wav.shape[0] + + # scale both to have same dynamic range. Remove DC too. + clean_speech -= clean_speech.mean() + processed_speech -= processed_speech.mean() + processed_speech *= (np.max(np.abs(clean_speech)) / np.max(np.abs(processed_speech))) + + # global variables + winlength = int(np.round(30 * srate / 1000)) # 30 msecs + skiprate = winlength // 4 + MIN_SNR = -10 + MAX_SNR = 35 + + # For each frame, calculate SSNR + num_frames = int(clean_length / skiprate - (winlength/skiprate)) + start = 0 + time = np.linspace(1, winlength, winlength) / (winlength + 1) + window = 0.5 * (1 - np.cos(2 * np.pi * time)) + segmental_snr = [] + + for frame_count in range(int(num_frames)): + # (1) get the frames for the test and ref speech. + # Apply Hanning Window + clean_frame = clean_speech[start:start+winlength] + processed_frame = processed_speech[start:start+winlength] + clean_frame = clean_frame * window + processed_frame = processed_frame * window + + # (2) Compute Segmental SNR + signal_energy = np.sum(clean_frame ** 2) + noise_energy = np.sum((clean_frame - processed_frame) ** 2) + segmental_snr.append(10 * np.log10(signal_energy / (noise_energy + eps)+ eps)) + segmental_snr[-1] = max(segmental_snr[-1], MIN_SNR) + segmental_snr[-1] = min(segmental_snr[-1], MAX_SNR) + start += int(skiprate) + return sum(segmental_snr) / len(segmental_snr) diff --git a/scores/stoi.py b/scores/stoi.py new file mode 100644 index 0000000000000000000000000000000000000000..e8dba9cf5ff41fd282615ef431959cda3f0b71d2 --- /dev/null +++ b/scores/stoi.py @@ -0,0 +1,16 @@ +from basis import ScoreBasis + + +class STOI(ScoreBasis): + def __init__(self): + super(STOI, self).__init__(name='STOI') + self.intrusive = False + self.mono = True + + def windowed_scoring(self, audios, score_rate): + from pystoi.stoi import stoi + if len(audios) != 2: + raise ValueError('STOI needs a reference and a 
test signals.') + + return stoi(audios[1], audios[0], score_rate, extended=False) + diff --git a/scores/wSDR.py b/scores/wSDR.py new file mode 100644 index 0000000000000000000000000000000000000000..d4eb7bac3e3626bc135446f54d7281bd4b088fa0 --- /dev/null +++ b/scores/wSDR.py @@ -0,0 +1,37 @@ +import torch + + +class WeightedSDR: + def __init__(self): + self.loss = weighted_signal_distortion_ratio_loss + + def __call__(self, output, bd): + return self.loss(output, bd) + + +def dotproduct(y, y_hat): + # batch x channel x nsamples + return torch.bmm(y.view(y.shape[0], 1, y.shape[-1]), y_hat.view(y_hat.shape[0], y_hat.shape[-1], 1)).reshape(-1) + + +def weighted_signal_distortion_ratio_loss(output, bd): + y = bd['y'] # target signal + z = bd['z'] # noise signal + + y_hat = output + z_hat = bd['x'] - y_hat # expected noise signal + + # mono channel only... + # can i fix this? + y_norm = torch.norm(y, dim=-1).squeeze(1) + z_norm = torch.norm(z, dim=-1).squeeze(1) + y_hat_norm = torch.norm(y_hat, dim=-1).squeeze(1) + z_hat_norm = torch.norm(z_hat, dim=-1).squeeze(1) + + def loss_sdr(a, a_hat, a_norm, a_hat_norm): + return dotproduct(a, a_hat) / (a_norm * a_hat_norm + 1e-8) + + alpha = y_norm.pow(2) / (y_norm.pow(2) + z_norm.pow(2) + 1e-8) + loss_wSDR = -alpha * loss_sdr(y, y_hat, y_norm, y_hat_norm) - (1 - alpha) * loss_sdr(z, z_hat, z_norm, z_hat_norm) + + return loss_wSDR.mean() diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..abe1d804809214db63d24c27f0022f2dd3f13215 --- /dev/null +++ b/setup.py @@ -0,0 +1,30 @@ + +# -*- coding: utf-8 -*- + + +from setuptools import setup, find_packages + +setup( + name="metrics", + version="1.0", + packages=find_packages(), + + install_requires=[ + 'numpy<1.24', + 'scipy', + 'tqdm', + 'resampy', + 'pystoi', + 'museval', + 'tensorflow>=2.0.0', + 'librosa', + # This is requred, but srmrpy pull it in, + # and there is a pip3 conflict if we have the following + # line. 
+ #'gammatone @ git+https://github.com/detly/gammatone', + 'pypesq @ git+https://github.com/vBaiCai/python-pesq', + 'srmrpy @ git+https://github.com/jfsantos/SRMRpy', + 'pesq @ git+https://github.com/ludlows/python-pesq', + ], + include_package_data=True +) diff --git a/speechscore.py b/speechscore.py new file mode 100644 index 0000000000000000000000000000000000000000..2745ed6a99d0d14f3ea2dd8cbc60b2c2b0eb84e2 --- /dev/null +++ b/speechscore.py @@ -0,0 +1,210 @@ +import os +import librosa +import soundfile as sf +import resampy +import numpy as np +from scores.srmr.srmr import SRMR +from scores.dnsmos.dnsmos import DNSMOS +from scores.pesq import PESQ +from scores.nb_pesq import NB_PESQ +from scores.sisdr import SISDR +from scores.stoi import STOI +from scores.fwsegsnr import FWSEGSNR +from scores.lsd import LSD +from scores.bsseval import BSSEval +from scores.snr import SNR +from scores.ssnr import SSNR +from scores.llr import LLR +from scores.csig import CSIG +from scores.cbak import CBAK +from scores.covl import COVL +from scores.mcd import MCD + +def compute_mean_results(*results): + mean_result = {} + + # Use the first dictionary as a reference for keys + for key in results[0]: + # If the value is a nested dictionary, recurse + if isinstance(results[0][key], dict): + nested_results = [d[key] for d in results] + mean_result[key] = compute_mean_results(*nested_results) + # Otherwise, compute the mean of the values + else: + mean_result[key] = sum(d[key] for d in results) / len(results) + + return mean_result + +class ScoresList: + def __init__(self): + self.scores = [] + + def __add__(self, score): + self.scores += [score] + return self + + def __str__(self): + return 'Scores: ' + ' '.join([x.name for x in self.scores]) + + def __call__(self, test_path, reference_path, window=None, score_rate=None, return_mean=False): + """ + window: float + the window length in seconds to use for scoring the files. 
+ score_rate: + the sampling rate specified for scoring the files. + """ + if test_path is None: + print(f'Please provide audio path for test_path') + return + results = {} + + if os.path.isdir(test_path): + audio_list = self.get_audio_list(test_path) + if audio_list is None: return + for audio_id in audio_list: + results_id = {} + if reference_path is not None: + data = self.audio_reader(test_path+'/'+audio_id, reference_path+'/'+audio_id) + else: + data = self.audio_reader(test_path+'/'+audio_id, None) + for score in self.scores: + result_score = score.scoring(data, window, score_rate) + results_id[score.name] = result_score + results[audio_id] = results_id + else: + data = self.audio_reader(test_path, reference_path) + for score in self.scores: + result_score = score.scoring(data, window, score_rate) + results[score.name] = result_score + + if return_mean: + mean_result = compute_mean_results(*results.values()) + results['Mean_Score'] = mean_result + + return results + + def get_audio_list(self, path): + # Initialize an empty list to store audio file names + audio_list = [] + + # Find all '.wav' audio files in the given path + path_list = librosa.util.find_files(path, ext="wav") + + # If no '.wav' files are found, try to find '.flac' audio files instead + if len(path_list) == 0: + path_list = librosa.util.find_files(path, ext="flac") + + # If no audio files are found at all, print an error message and return None + if len(path_list) == 0: + print(f'No audio files found in {path}, scoring ended!') + return None + + # Loop through the list of found audio file paths + for audio_path in path_list: + # Split the file path by '/' and append the last element (the file name) to the audio_list + audio_path_s = audio_path.split('/') + audio_list.append(audio_path_s[-1]) + + # Return the list of audio file names + return audio_list + + def audio_reader(self, test_path, reference_path): + """loading sound files and making sure they all have the same lengths + (zero-padding 
to the largest). Also works with numpy arrays. + """ + data = {} + audios = [] + maxlen = 0 + audio_test, rate_test = sf.read(test_path, always_2d=True) + + if audio_test.shape[1] > 1: + audio_test = audio_test[..., 0, None] + + rate = rate_test + if reference_path is not None: + audio_ref, rate_ref = sf.read(reference_path, always_2d=True) + if audio_ref.shape[1] > 1: + audio_ref = audio_ref[..., 0, None] + if rate_test != rate_ref: + rate = min(rate_test, rate_ref) + if rate_test != rate: + audio_test = resampy.resample(audio_test, rate_test, rate, axis=0) + if rate_ref != rate: + audio_ref = resampy.resample(audio_ref, rate_ref, rate, axis=0) + audios += [audio_test] + audios += [audio_ref] + else: + audios += [audio_test] + + maxlen = 0 + for index, audio in enumerate(audios): + maxlen = max(maxlen, audio.shape[0]) + ##padding + for index, audio in enumerate(audios): + if audio.shape[0] != maxlen: + new = np.zeros((maxlen,)) + new[:audio.shape[0]] = audio[...,0] + audios[index] = new + else: + audios[index] = audio[...,0] + data['audio'] = audios + data['rate'] = rate + return data + +def SpeechScore(scores=''): + """ Load the desired scores inside a Metrics object that can then + be called to compute all the desired scores. + + Parameters: + ---------- + scores: str or list of str + the scores matching any of these will be automatically loaded. this + match is relative to the structure of the speechscores package. 
def SpeechScore(scores=''):
    """Load the desired scores inside a ScoresList object that can then
    be called to compute all the desired scores.

    Parameters:
    ----------
    scores: str or list of str
        Score names (case-insensitive), e.g. 'pesq' or ['pesq', 'stoi'].
        An empty string '' loads every available score (this includes the
        non-intrusive ones such as SRMR and DNSMOS).

    Returns:
    --------

    A ScoresList object, that can be run to get the desired scores
    """
    # Registry maps lowercase names to scorer classes; replaces the long
    # if/elif chain and makes the "load everything" case trivial.
    registry = {
        'srmr': SRMR,
        'pesq': PESQ,
        'nb_pesq': NB_PESQ,
        'stoi': STOI,
        'sisdr': SISDR,
        'fwsegsnr': FWSEGSNR,
        'lsd': LSD,
        'bsseval': BSSEval,
        'dnsmos': DNSMOS,
        'snr': SNR,
        'ssnr': SSNR,
        'llr': LLR,
        'csig': CSIG,
        'cbak': CBAK,
        'covl': COVL,
        'mcd': MCD,
    }

    # Bug fix: iterating a plain string such as 'pesq' used to walk its
    # CHARACTERS ('p', 'e', ...), loading nothing.  Wrap single names in a
    # list, and honor the documented '' => "match all" behavior.
    if isinstance(scores, str):
        scores = list(registry) if scores == '' else [scores]

    score_cls = ScoresList()
    for score in scores:
        scorer = registry.get(score.lower())
        if scorer is None:
            print('score is pending implementation...')
        else:
            score_cls += scorer()
    return score_cls