Spaces:
Running
Running
Upload 73 files
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- __init__.py +169 -0
- __pycache__/basis.cpython-38.pyc +0 -0
- __pycache__/metric_loader.cpython-38.pyc +0 -0
- __pycache__/metrics.cpython-38.pyc +0 -0
- __pycache__/speechscore.cpython-38.pyc +0 -0
- audios/clean/audio_1.wav +0 -0
- audios/clean/audio_2.wav +0 -0
- audios/noisy/audio_1.wav +0 -0
- audios/noisy/audio_2.wav +0 -0
- audios/ref.wav +0 -0
- audios/test.wav +0 -0
- basis.py +113 -0
- demo.py +29 -0
- requirement.txt +5 -0
- scores/__init__.py +0 -0
- scores/__pycache__/__init__.cpython-38.pyc +0 -0
- scores/__pycache__/bsseval.cpython-38.pyc +0 -0
- scores/__pycache__/cbak.cpython-38.pyc +0 -0
- scores/__pycache__/covl.cpython-38.pyc +0 -0
- scores/__pycache__/csig.cpython-38.pyc +0 -0
- scores/__pycache__/fwsegsnr.cpython-38.pyc +0 -0
- scores/__pycache__/helper.cpython-38.pyc +0 -0
- scores/__pycache__/llr.cpython-38.pyc +0 -0
- scores/__pycache__/lsd.cpython-38.pyc +0 -0
- scores/__pycache__/mcd.cpython-38.pyc +0 -0
- scores/__pycache__/nb_pesq.cpython-38.pyc +0 -0
- scores/__pycache__/pesq.cpython-38.pyc +0 -0
- scores/__pycache__/sisdr.cpython-38.pyc +0 -0
- scores/__pycache__/snr.cpython-38.pyc +0 -0
- scores/__pycache__/ssnr.cpython-38.pyc +0 -0
- scores/__pycache__/stoi.cpython-38.pyc +0 -0
- scores/bsseval.py +21 -0
- scores/cbak.py +37 -0
- scores/covl.py +39 -0
- scores/csig.py +38 -0
- scores/dnsmos/DNSMOS/bak_ovr.onnx +3 -0
- scores/dnsmos/DNSMOS/model_v8.onnx +3 -0
- scores/dnsmos/DNSMOS/sig.onnx +3 -0
- scores/dnsmos/DNSMOS/sig_bak_ovr.onnx +3 -0
- scores/dnsmos/__pycache__/dnsmos.cpython-38.pyc +0 -0
- scores/dnsmos/dnsmos.py +94 -0
- scores/fwsegsnr.py +49 -0
- scores/helper.py +307 -0
- scores/helper_bk.py +438 -0
- scores/llr.py +66 -0
- scores/lsd.py +30 -0
- scores/mcd.py +136 -0
- scores/mosnet/__init__.py +21 -0
- scores/mosnet/__pycache__/__init__.cpython-38.pyc +0 -0
- scores/mosnet/cnn_blstm.h5 +3 -0
__init__.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class Metric:
    """Base class for a speech metric.

    Subclasses implement `test_window` and may set `fixed_rate`, `mono`
    and `absolute` in their constructor to declare their requirements.
    """

    def __init__(self, name, window, hop=None, verbose=False):
        # the metric operates on some fixed rate only or only on mono ?
        self.fixed_rate = None
        self.mono = False

        # is the metric absolute or relative ?
        # (absolute: scores a single signal; relative: needs a reference)
        self.absolute = False

        # length and hop of windows (in seconds; hop defaults to the window
        # length, i.e. non-overlapping windows)
        self.window = window
        if hop is None:
            hop = window
        self.hop = hop
        self.name = name
        self.verbose = verbose

    def test_window(self, audios, rate):
        # Subclasses compute the metric over one window of audio here.
        raise NotImplementedError

    def test(self, *test_files, array_rate=None):
        """loading sound files and making sure they all have the same lengths
        (zero-padding to the largest). Also works with numpy arrays.
        Then, calling the `test_window` function that should be specialised
        depending on the metric."""

        # imports are local so the module can be imported without these
        # third-party dependencies installed
        import soundfile as sf
        import resampy
        from museval.metrics import Framing
        import numpy as np

        audios = []
        maxlen = 0
        # NOTE(review): `test_files` comes from *args and is therefore always
        # a tuple, never a str — this branch looks unreachable; confirm.
        if isinstance(test_files, str):
            test_files = [test_files]
        if self.absolute and len(test_files) > 1:
            if self.verbose:
                print(' [%s] is absolute. Processing first file only'
                      % self.name)
            test_files = [test_files[0],]

        for file in test_files:
            # Loading sound file (a str is a path; anything else is assumed
            # to be an array-like already holding samples)
            if isinstance(file, str):
                audio, rate = sf.read(file, always_2d=True)
            else:
                rate = array_rate
                if rate is None:
                    raise ValueError('Sampling rate needs to be specified '
                                     'when feeding numpy arrays.')
                audio = file
            # Standardize shapes: always (nsamples, nchannels)
            if len(audio.shape) == 1:
                audio = audio[:, None]
            if len(audio.shape) != 2:
                raise ValueError('Please provide 1D or 2D array, received '
                                 '{}D array'.format(len(audio.shape)))

            if self.fixed_rate is not None and rate != self.fixed_rate:
                if self.verbose:
                    # NOTE(review): message says "kHz" but fixed_rate is in Hz
                    # (e.g. prints "16000kHz") — confirm intended wording.
                    print(' [%s] preferred is %dkHz rate. resampling'
                          % (self.name, self.fixed_rate))
                audio = resampy.resample(audio, rate, self.fixed_rate, axis=0)
                rate = self.fixed_rate
            if self.mono and audio.shape[1] > 1:
                if self.verbose:
                    print(' [%s] only supports mono. Will use first channel'
                          % self.name)
                audio = audio[..., 0, None]
            if self.mono:
                # drop the channel axis entirely for mono-only metrics
                audio = audio[..., 0]
            maxlen = max(maxlen, audio.shape[0])
            audios += [audio]

        # zero-pad every signal to the longest one so they can be framed
        # in lockstep
        for index, audio in enumerate(audios):
            if audio.shape[0] != maxlen:
                new = np.zeros((maxlen,) + audio.shape[1:])
                new[:audio.shape[0]] = audio
                audios[index] = new

        if self.window is not None:
            # windowed evaluation: one score per frame, collected per metric
            framer = Framing(self.window * rate,
                             self.hop * rate, maxlen)
            nwin = framer.nwin
            result = {}
            for (t, win) in enumerate(framer):
                result_t = self.test_window([audio[win] for audio in audios],
                                            rate)
                for metric in result_t.keys():
                    if metric not in result.keys():
                        result[metric] = np.empty(nwin)
                    result[metric][t] = result_t[metric]
        else:
            # no windowing: score the full-length signals once
            result = self.test_window(audios, rate)
        return result
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# BUG FIX: this file is a package __init__ (it uses __path__ below), and
# Python 3 removed implicit relative imports, so `import absolute` would
# raise ModuleNotFoundError. Import the sibling submodules explicitly.
from . import absolute
from . import relative
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
class MetricsList:
    """A flat collection of metrics that can be evaluated in one call.

    Metrics are accumulated with `+` and evaluated together by calling
    the list like a function on the files to score.
    """

    def __init__(self):
        self.metrics = []

    def __add__(self, metric):
        # Accumulate in place and return self so `lst = lst + m` and
        # `lst += m` both work.
        self.metrics.append(metric)
        return self

    def __str__(self):
        names = ' '.join(m.name for m in self.metrics)
        return 'Metrics: ' + names

    def __call__(self, *files, rate=None):
        # Evaluate each metric on the same files and merge the per-metric
        # result dicts into one (later metrics overwrite duplicate keys).
        combined = {}
        for metric in self.metrics:
            partial = metric.test(*files, array_rate=rate)
            combined.update(partial)
        return combined
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def load(metrics='', window=2, verbose=False):
    """ Load the desired metrics inside a Metrics object that can then
    be called to compute all the desired metrics.

    Parameters:
    ----------
    metrics: str or list of str
        the metrics matching any of these will be automatically loaded. this
        match is relative to the structure of the speechmetrics package.
        For instance:
        * 'absolute' will match all absolute metrics
        * 'absolute.srmr' or 'srmr' will only match SRMR
        * '' will match all

    window: float
        the window length to use for testing the files.

    verbose: boolean
        will display information during computations

    Returns:
    --------

    A MetricsList object, that can be run to get the desired metrics
    """
    import pkgutil
    import importlib

    result = MetricsList()

    found_modules = []
    # walk every submodule of this package; module names are fully qualified
    iterator = pkgutil.walk_packages(__path__, __name__ + '.')

    if isinstance(metrics, str):
        metrics = [metrics]
    for module_info in iterator:
        # substring match against the qualified module name ('' matches all)
        if any(metric in module_info.name for metric in metrics):
            module = importlib.import_module(module_info.name)
            if module not in found_modules:
                # BUG FIX: was `found_modules += [module],` — the trailing
                # comma made it append the *list* [module], so the dedup
                # check `module not in found_modules` never matched and a
                # module could be loaded more than once.
                found_modules.append(module)
                if hasattr(module, 'load'):
                    load_function = getattr(module, 'load')
                    new_metric = load_function(window)
                    new_metric.verbose = verbose
                    result += new_metric
                    print('Loaded ', module_info.name)
    return result
|
__pycache__/basis.cpython-38.pyc
ADDED
|
Binary file (1.57 kB). View file
|
|
|
__pycache__/metric_loader.cpython-38.pyc
ADDED
|
Binary file (3.48 kB). View file
|
|
|
__pycache__/metrics.cpython-38.pyc
ADDED
|
Binary file (2.72 kB). View file
|
|
|
__pycache__/speechscore.cpython-38.pyc
ADDED
|
Binary file (5.95 kB). View file
|
|
|
audios/clean/audio_1.wav
ADDED
|
Binary file (76.8 kB). View file
|
|
|
audios/clean/audio_2.wav
ADDED
|
Binary file (76.8 kB). View file
|
|
|
audios/noisy/audio_1.wav
ADDED
|
Binary file (76.8 kB). View file
|
|
|
audios/noisy/audio_2.wav
ADDED
|
Binary file (76.8 kB). View file
|
|
|
audios/ref.wav
ADDED
|
Binary file (76.8 kB). View file
|
|
|
audios/test.wav
ADDED
|
Binary file (76.8 kB). View file
|
|
|
basis.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class ScoreBasis:
    """Base class for every score in the package.

    Subclasses implement `windowed_scoring` and may set `score_rate`
    (a mandatory sampling rate) and `intrusive` in their constructor.
    """

    def __init__(self, name=None):
        # the score operates on the specified rate (None = any rate)
        self.score_rate = None
        # is the score intrusive or non-intrusive ?
        # NOTE(review): the comment says True means "require a reference",
        # but several subclasses appear to set it inconsistently; the flag is
        # not read by scoring() below — confirm intended semantics.
        self.intrusive = True  # require a reference
        self.name = name

    def windowed_scoring(self, audios, score_rate):
        # Subclasses compute the score over one window of audio here.
        raise NotImplementedError(f'In {self.name}, windowed_scoring is not yet implemented')

    def scoring(self, data, window=None, score_rate=None):
        """ calling the `windowed_scoring` function that should be specialised
        depending on the score.

        Parameters
        ----------
        data: dict with keys 'audio' (list of arrays) and 'rate' (int)
        window: float or None
            window length in seconds; None scores the full signal at once
        score_rate: unused
            NOTE(review): this parameter is immediately overwritten by
            data['rate'] below — confirm whether it should take precedence.
        """
        # local imports keep the module importable without these dependencies
        import resampy
        from museval.metrics import Framing

        audios = data['audio']
        score_rate = data['rate']

        # a score with a fixed rate overrides the incoming rate
        if self.score_rate is not None:
            score_rate = self.score_rate

        if score_rate != data['rate']:
            for index, audio in enumerate(audios):
                audio = resampy.resample(audio, data['rate'], score_rate, axis=0)
                audios[index] = audio

        if window is not None:
            # BUG FIX: `maxlen` was undefined here, raising NameError for any
            # windowed scoring. Use the longest signal length so Framing can
            # iterate over the whole input.
            maxlen = max(audio.shape[0] for audio in audios)
            framer = Framing(window * score_rate, window * score_rate, maxlen)
            nwin = framer.nwin
            result = {}
            for (t, win) in enumerate(framer):
                result_t = self.windowed_scoring([audio[win] for audio in audios], score_rate)
                result[t] = result_t
        else:
            result = self.windowed_scoring(audios, score_rate)
        return result
|
| 43 |
+
"""
|
| 44 |
+
audios = []
|
| 45 |
+
maxlen = 0
|
| 46 |
+
if isinstance(test_files, str):
|
| 47 |
+
test_files = [test_files]
|
| 48 |
+
print(f'test_files: {test_files}')
|
| 49 |
+
if not self.intrusive and len(test_files) > 1:
|
| 50 |
+
if self.verbose:
|
| 51 |
+
print(' [%s] is non-intrusive. Processing first file only'
|
| 52 |
+
% self.name)
|
| 53 |
+
test_files = [test_files[0],]
|
| 54 |
+
for file in test_files:
|
| 55 |
+
# Loading sound file
|
| 56 |
+
if isinstance(file, str):
|
| 57 |
+
audio, rate = sf.read(file, always_2d=True)
|
| 58 |
+
else:
|
| 59 |
+
rate = array_rate
|
| 60 |
+
if rate is None:
|
| 61 |
+
raise ValueError('Sampling rate needs to be specified '
|
| 62 |
+
'when feeding numpy arrays.')
|
| 63 |
+
audio = file
|
| 64 |
+
# Standardize shapes
|
| 65 |
+
if len(audio.shape) == 1:
|
| 66 |
+
audio = audio[:, None]
|
| 67 |
+
if len(audio.shape) != 2:
|
| 68 |
+
raise ValueError('Please provide 1D or 2D array, received '
|
| 69 |
+
'{}D array'.format(len(audio.shape)))
|
| 70 |
+
|
| 71 |
+
if self.fixed_rate is not None and rate != self.fixed_rate:
|
| 72 |
+
if self.verbose:
|
| 73 |
+
print(' [%s] preferred is %dkHz rate. resampling'
|
| 74 |
+
% (self.name, self.fixed_rate))
|
| 75 |
+
audio = resampy.resample(audio, rate, self.fixed_rate, axis=0)
|
| 76 |
+
rate = self.fixed_rate
|
| 77 |
+
if self.mono and audio.shape[1] > 1:
|
| 78 |
+
if self.verbose:
|
| 79 |
+
print(' [%s] only supports mono. Will use first channel'
|
| 80 |
+
% self.name)
|
| 81 |
+
audio = audio[..., 0, None]
|
| 82 |
+
if self.mono:
|
| 83 |
+
audio = audio[..., 0]
|
| 84 |
+
maxlen = max(maxlen, audio.shape[0])
|
| 85 |
+
audios += [audio]
|
| 86 |
+
audio = audios[1]
|
| 87 |
+
audio[:maxlen-320] = audio[320:]
|
| 88 |
+
audios[1] = audio
|
| 89 |
+
for index, audio in enumerate(audios):
|
| 90 |
+
if audio.shape[0] != maxlen:
|
| 91 |
+
new = np.zeros((maxlen,) + audio.shape[1:])
|
| 92 |
+
new[:audio.shape[0]] = audio
|
| 93 |
+
audios[index] = new
|
| 94 |
+
|
| 95 |
+
if self.window is not None:
|
| 96 |
+
framer = Framing(self.window * rate,
|
| 97 |
+
self.hop * rate, maxlen)
|
| 98 |
+
nwin = framer.nwin
|
| 99 |
+
result = {}
|
| 100 |
+
for (t, win) in enumerate(framer):
|
| 101 |
+
result_t = self.test_window([audio[win] for audio in audios],
|
| 102 |
+
rate)
|
| 103 |
+
#or metric in result_t.keys():
|
| 104 |
+
# if metric not in result.keys():
|
| 105 |
+
# result[metric] = np.empty(nwin)
|
| 106 |
+
# result[metric][t] = result_t[metric]
|
| 107 |
+
result[t] = result_t
|
| 108 |
+
else:
|
| 109 |
+
result = self.test_window(audios, rate)
|
| 110 |
+
return result
|
| 111 |
+
"""
|
| 112 |
+
|
| 113 |
+
|
demo.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Demo: run a battery of speech-quality metrics on noisy audio against
# clean references and pretty-print the results.
import pprint

from speechscore import SpeechScore

if __name__ == '__main__':
    # Build the scorer with the metrics to evaluate; any subset of this
    # list is supported.
    speech_score = SpeechScore([
        'SRMR', 'PESQ', 'NB_PESQ', 'STOI', 'SISDR',
        'FWSEGSNR', 'LSD', 'BSSEval', 'DNSMOS',
        'SNR', 'SSNR', 'LLR', 'CSIG', 'CBAK',
        'COVL', 'MCD'
    ])

    # Evaluate noisy vs clean audio.
    #   test_path / reference_path: audio directories or single .wav/.flac paths
    #   window: seconds per scoring window (None = score the full audio)
    #   score_rate: sampling rate the metrics are computed at
    #   return_mean: also return the per-metric mean over all files
    scores = speech_score(
        test_path='audios/noisy/',
        reference_path='audios/clean/',
        window=None,
        score_rate=16000,
        return_mean=True,
    )

    # Full per-file results.
    pprint.pprint(scores)

    # Aggregate (mean) scores only.
    pprint.pprint(scores['Mean_Score'])
|
requirement.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pysptk
|
| 2 |
+
pymcd
|
| 3 |
+
pyworld
|
| 4 |
+
fastdtw
|
| 5 |
+
museval
|
scores/__init__.py
ADDED
|
File without changes
|
scores/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (187 Bytes). View file
|
|
|
scores/__pycache__/bsseval.cpython-38.pyc
ADDED
|
Binary file (1.15 kB). View file
|
|
|
scores/__pycache__/cbak.cpython-38.pyc
ADDED
|
Binary file (1.48 kB). View file
|
|
|
scores/__pycache__/covl.cpython-38.pyc
ADDED
|
Binary file (1.53 kB). View file
|
|
|
scores/__pycache__/csig.cpython-38.pyc
ADDED
|
Binary file (1.52 kB). View file
|
|
|
scores/__pycache__/fwsegsnr.cpython-38.pyc
ADDED
|
Binary file (2.09 kB). View file
|
|
|
scores/__pycache__/helper.cpython-38.pyc
ADDED
|
Binary file (6.64 kB). View file
|
|
|
scores/__pycache__/llr.cpython-38.pyc
ADDED
|
Binary file (2.09 kB). View file
|
|
|
scores/__pycache__/lsd.cpython-38.pyc
ADDED
|
Binary file (1.5 kB). View file
|
|
|
scores/__pycache__/mcd.cpython-38.pyc
ADDED
|
Binary file (4.65 kB). View file
|
|
|
scores/__pycache__/nb_pesq.cpython-38.pyc
ADDED
|
Binary file (922 Bytes). View file
|
|
|
scores/__pycache__/pesq.cpython-38.pyc
ADDED
|
Binary file (921 Bytes). View file
|
|
|
scores/__pycache__/sisdr.cpython-38.pyc
ADDED
|
Binary file (1.2 kB). View file
|
|
|
scores/__pycache__/snr.cpython-38.pyc
ADDED
|
Binary file (1.52 kB). View file
|
|
|
scores/__pycache__/ssnr.cpython-38.pyc
ADDED
|
Binary file (2.05 kB). View file
|
|
|
scores/__pycache__/stoi.cpython-38.pyc
ADDED
|
Binary file (926 Bytes). View file
|
|
|
scores/bsseval.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from basis import ScoreBasis
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class BSSEval(ScoreBasis):
    """BSS Eval source-separation scores (SDR / ISR / SAR) via museval."""

    def __init__(self):
        super(BSSEval, self).__init__(name='BSSEval')
        # NOTE(review): windowed_scoring below requires exactly two signals
        # (a reference and a test), yet this flag is set False while the base
        # class comments say True means "require a reference" — confirm the
        # intended meaning of `intrusive` across the package.
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        # Infinite window/hop: museval scores the whole signal in one frame.
        bss_window = np.inf
        bss_hop = np.inf
        from museval.metrics import bss_eval
        if len(audios) != 2:
            raise ValueError('BSSEval needs a reference and a test signals.')

        # audios[1] is taken as the reference, audios[0] as the estimate
        # — presumably (test, reference) ordering from the caller; confirm.
        result = bss_eval(reference_sources=audios[1][None,...], # shape: [nsrc, nsample, nchannels]
                          estimated_sources=audios[0][None,...],
                          window=bss_window * score_rate,
                          hop=bss_hop * score_rate)
        # bss_eval returns (sdr, isr, sir, sar, perm); SIR (index 2) is
        # deliberately not reported here.
        return {'SDR': result[0][0][0], 'ISR': result[1][0][0], 'SAR': result[3][0][0]}
|
scores/cbak.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from basis import ScoreBasis
|
| 2 |
+
import numpy as np
|
| 3 |
+
from pesq import pesq
|
| 4 |
+
from scores.helper import wss, llr, SSNR, trim_mos
|
| 5 |
+
|
| 6 |
+
class CBAK(ScoreBasis):
    """Composite background-noise quality score (CBAK).

    Intrusive composite measure combining PESQ, WSS and segmental SNR;
    always computed at 16 kHz.
    """

    def __init__(self):
        super(CBAK, self).__init__(name='CBAK')
        self.intrusive = False
        self.score_rate = 16000

    def windowed_scoring(self, audios, score_rate):
        # Requires exactly a (reference, test) pair of signals.
        if len(audios) != 2:
            raise ValueError('CBAK needs a reference and a test signals.')
        return cal_CBAK(audios[0], audios[1], score_rate)
|
| 16 |
+
|
| 17 |
+
def cal_CBAK(target_wav, pred_wav, fs):
    """Compute the CBAK composite score for a (target, prediction) pair.

    Combines trimmed-mean WSS, segmental SNR and wide-band PESQ with the
    standard linear weights, then clips the result to the MOS range.
    """
    alpha = 0.95  # fraction of best (lowest-distortion) frames kept

    # Weighted spectral slope distance: trimmed mean over the best frames.
    frame_distortions = sorted(wss(target_wav, pred_wav, fs))
    keep = int(round(len(frame_distortions) * alpha))
    wss_dist = np.mean(frame_distortions[:keep])

    # Segmental SNR averaged over frames.
    _, segsnr_frames = SSNR(target_wav, pred_wav, fs)
    segSNR = np.mean(segsnr_frames)

    # Wide-band PESQ.
    pesq_raw = pesq(fs, target_wav, pred_wav, 'wb')

    # Linear combination, clipped to a valid MOS value.
    cbak = 1.634 + 0.478 * pesq_raw - 0.007 * wss_dist + 0.063 * segSNR
    return trim_mos(cbak)
|
| 37 |
+
|
scores/covl.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from basis import ScoreBasis
|
| 2 |
+
import numpy as np
|
| 3 |
+
from pesq import pesq
|
| 4 |
+
from scores.helper import wss, llr, SSNR, trim_mos
|
| 5 |
+
|
| 6 |
+
class COVL(ScoreBasis):
    """Composite overall quality score (COVL).

    Intrusive composite measure combining PESQ, LLR and WSS;
    always computed at 16 kHz.
    """

    def __init__(self):
        super(COVL, self).__init__(name='COVL')
        self.intrusive = False
        self.score_rate = 16000

    def windowed_scoring(self, audios, score_rate):
        # Requires exactly a (reference, test) pair of signals.
        if len(audios) != 2:
            raise ValueError('COVL needs a reference and a test signals.')
        return cal_COVL(audios[0], audios[1], score_rate)
|
| 16 |
+
|
| 17 |
+
def cal_COVL(target_wav, pred_wav, fs):
    """Compute the COVL composite score for a (target, prediction) pair.

    Combines wide-band PESQ, trimmed-mean LLR and trimmed-mean WSS with
    the standard linear weights, clipped to the MOS range.
    """
    alpha = 0.95  # fraction of best (lowest-distortion) frames kept

    # Compute WSS measure (trimmed mean over the best frames)
    wss_dist_vec = sorted(wss(target_wav, pred_wav, fs))
    wss_dist = np.mean(wss_dist_vec[:int(round(len(wss_dist_vec) * alpha))])

    # Compute LLR measure (redundant `LLRs = LLR_dist` alias removed)
    LLR_dist = sorted(llr(target_wav, pred_wav, fs))
    LLR_len = round(len(LLR_dist) * alpha)
    llr_mean = np.mean(LLR_dist[:LLR_len])

    # Compute the wide-band PESQ
    pesq_raw = pesq(fs, target_wav, pred_wav, 'wb')

    # Covl: linear combination, clipped to a valid MOS value
    Covl = 1.594 + 0.805 * pesq_raw - 0.512 * llr_mean - 0.007 * wss_dist
    return trim_mos(Covl)
|
scores/csig.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from basis import ScoreBasis
|
| 2 |
+
import numpy as np
|
| 3 |
+
from pesq import pesq
|
| 4 |
+
from scores.helper import wss, llr, SSNR, trim_mos
|
| 5 |
+
|
| 6 |
+
class CSIG(ScoreBasis):
    """Composite signal-distortion quality score (CSIG).

    Intrusive composite measure combining LLR, PESQ and WSS;
    always computed at 16 kHz.
    """

    def __init__(self):
        super(CSIG, self).__init__(name='CSIG')
        self.score_rate = 16000
        # Consistency fix: the sibling composite measures (CBAK, COVL) set
        # this flag and CSIG has identical input requirements; it was missing
        # here. The flag is not read by ScoreBasis.scoring(), so this is
        # inert for current callers.
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        # Requires exactly a (reference, test) pair of signals.
        if len(audios) != 2:
            raise ValueError('CSIG needs a reference and a test signals.')
        return cal_CSIG(audios[0], audios[1], score_rate)
|
| 15 |
+
|
| 16 |
+
def cal_CSIG(target_wav, pred_wav, fs):
    """Compute the CSIG composite score for a (target, prediction) pair.

    Combines trimmed-mean LLR, wide-band PESQ and trimmed-mean WSS with
    the standard linear weights, clipped to the MOS range.
    """
    alpha = 0.95  # fraction of best (lowest-distortion) frames kept

    # Compute WSS measure (trimmed mean over the best frames)
    wss_dist_vec = sorted(wss(target_wav, pred_wav, fs))
    wss_dist = np.mean(wss_dist_vec[:int(round(len(wss_dist_vec) * alpha))])

    # Compute LLR measure (redundant `LLRs = LLR_dist` alias removed)
    LLR_dist = sorted(llr(target_wav, pred_wav, fs))
    LLR_len = round(len(LLR_dist) * alpha)
    llr_mean = np.mean(LLR_dist[:LLR_len])

    # Compute the wide-band PESQ
    pesq_raw = pesq(fs, target_wav, pred_wav, 'wb')

    # Csig: linear combination, clipped to a valid MOS value
    Csig = 3.093 - 1.029 * llr_mean + 0.603 * pesq_raw - 0.009 * wss_dist
    Csig = float(trim_mos(Csig))

    return Csig
|
scores/dnsmos/DNSMOS/bak_ovr.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5f335c90994618150192a656a474bcf8a9cbcedbc47965494ba8da79605d1308
|
| 3 |
+
size 742375
|
scores/dnsmos/DNSMOS/model_v8.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9246480c58567bc6affd4200938e77eef49468c8bc7ed3776d109c07456f6e91
|
| 3 |
+
size 224860
|
scores/dnsmos/DNSMOS/sig.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d2fbdb293bc2366dfbae2b7477c490f981d24a8b4405efd3c11787569c6549d7
|
| 3 |
+
size 742203
|
scores/dnsmos/DNSMOS/sig_bak_ovr.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:269fbebdb513aa23cddfbb593542ecc540284a91849ac50516870e1ac78f6edd
|
| 3 |
+
size 1157965
|
scores/dnsmos/__pycache__/dnsmos.cpython-38.pyc
ADDED
|
Binary file (3.63 kB). View file
|
|
|
scores/dnsmos/dnsmos.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
import librosa
|
| 4 |
+
import numpy as np
|
| 5 |
+
import numpy.polynomial.polynomial as poly
|
| 6 |
+
import onnxruntime as ort
|
| 7 |
+
import soundfile as sf
|
| 8 |
+
|
| 9 |
+
SAMPLING_RATE = 16000
|
| 10 |
+
INPUT_LENGTH = 9.01
|
| 11 |
+
|
| 12 |
+
from basis import ScoreBasis
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class DNSMOS(ScoreBasis):
    """DNSMOS neural MOS predictor (SIG/BAK/OVRL/P808) via ONNX models."""

    def __init__(self):
        super(DNSMOS, self).__init__(name='DNSMOS')
        self.intrusive = True
        self.score_rate = 16000
        # BUG FIX: the model paths were relative to the process working
        # directory ('scores/dnsmos/DNSMOS'), so loading failed unless the
        # program was launched from the repository root. Resolve them
        # relative to this module's own location instead.
        model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'DNSMOS')
        self.p808_model_path = os.path.join(model_dir, 'model_v8.onnx')
        self.primary_model_path = os.path.join(model_dir, 'sig_bak_ovr.onnx')
        self.compute_score = ComputeScore(self.primary_model_path, self.p808_model_path)

    def windowed_scoring(self, audios, rate):
        # With a pair of signals, score audios[1] — presumably the signal
        # under test, with audios[0] the reference (confirm against caller);
        # otherwise score the only signal provided.
        if len(audios) == 2:
            return self.compute_score.cal_mos(audios[1], rate)
        else:
            return self.compute_score.cal_mos(audios[0], rate)
|
| 29 |
+
|
| 30 |
+
class ComputeScore:
    """Runs the DNSMOS ONNX models over 9.01 s segments and averages them."""

    def __init__(self, primary_model_path, p808_model_path) -> None:
        # primary model: raw SIG/BAK/OVR; p808 model: P.808 MOS on a mel input
        self.onnx_sess = ort.InferenceSession(primary_model_path)
        self.p808_onnx_sess = ort.InferenceSession(p808_model_path)

    def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True):
        # Mel spectrogram feature for the P.808 model; returned transposed
        # as (frames, n_mels).
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=frame_size+1, hop_length=hop_length, n_mels=n_mels)
        if to_db:
            # map dB relative to peak into roughly [0, 1] via (dB + 40) / 40
            mel_spec = (librosa.power_to_db(mel_spec, ref=np.max)+40)/40
        return mel_spec.T

    def get_polyfit_val(self, sig, bak, ovr):
        # Fixed quadratic calibration curves mapping the raw model outputs
        # to calibrated MOS values (coefficients are highest power first).
        p_ovr = np.poly1d([-0.06766283, 1.11546468, 0.04602535])
        p_sig = np.poly1d([-0.08397278, 1.22083953, 0.0052439 ])
        p_bak = np.poly1d([-0.13166888, 1.60915514, -0.39604546])

        sig_poly = p_sig(sig)
        bak_poly = p_bak(bak)
        ovr_poly = p_ovr(ovr)

        return sig_poly, bak_poly, ovr_poly

    def cal_mos(self, audio, sampling_rate):
        """Score one mono signal; returns dict with OVRL/SIG/BAK/P808_MOS.

        The signal is tiled until it reaches 9.01 s, then scored over
        1-second-hop segments of 9.01 s each and averaged.
        """
        fs = sampling_rate
        # NOTE(review): actual_audio_len is never used below — dead variable.
        actual_audio_len = len(audio)
        len_samples = int(INPUT_LENGTH*fs)
        # tile (double) the audio until one full input window fits
        while len(audio) < len_samples:
            audio = np.append(audio, audio)

        # number of 1 s hops so that each 9.01 s segment fits in the signal
        num_hops = int(np.floor(len(audio)/fs) - INPUT_LENGTH)+1
        hop_len_samples = fs
        predicted_mos_sig_seg_raw = []
        predicted_mos_bak_seg_raw = []
        predicted_mos_ovr_seg_raw = []
        predicted_mos_sig_seg = []
        predicted_mos_bak_seg = []
        predicted_mos_ovr_seg = []
        predicted_p808_mos = []

        for idx in range(num_hops):
            audio_seg = audio[int(idx*hop_len_samples) : int((idx+INPUT_LENGTH)*hop_len_samples)]
            if len(audio_seg) < len_samples:
                # trailing partial segment: skip
                continue

            input_features = np.array(audio_seg).astype('float32')[np.newaxis,:]
            # P.808 model takes a mel spectrogram of the segment minus the
            # last hop of samples — presumably to align frame counts; confirm.
            p808_input_features = np.array(self.audio_melspec(audio=audio_seg[:-160])).astype('float32')[np.newaxis, :, :]
            oi = {'input_1': input_features}
            p808_oi = {'input_1': p808_input_features}
            p808_mos = self.p808_onnx_sess.run(None, p808_oi)[0][0][0]
            mos_sig_raw,mos_bak_raw,mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0]
            # calibrate raw outputs to MOS scale
            mos_sig,mos_bak,mos_ovr = self.get_polyfit_val(mos_sig_raw,mos_bak_raw,mos_ovr_raw)
            predicted_mos_sig_seg_raw.append(mos_sig_raw)
            predicted_mos_bak_seg_raw.append(mos_bak_raw)
            predicted_mos_ovr_seg_raw.append(mos_ovr_raw)
            predicted_mos_sig_seg.append(mos_sig)
            predicted_mos_bak_seg.append(mos_bak)
            predicted_mos_ovr_seg.append(mos_ovr)
            predicted_p808_mos.append(p808_mos)

        # average the calibrated per-segment scores
        results = {}
        results['OVRL'] = np.mean(predicted_mos_ovr_seg)
        results['SIG'] = np.mean(predicted_mos_sig_seg)
        results['BAK'] = np.mean(predicted_mos_bak_seg)
        results['P808_MOS'] = np.mean(predicted_p808_mos)
        return results
|
scores/fwsegsnr.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import librosa
|
| 2 |
+
import numpy as np
|
| 3 |
+
from basis import ScoreBasis
|
| 4 |
+
|
| 5 |
+
class FWSEGSNR(ScoreBasis):
    """Frequency-weighted segmental SNR score."""

    def __init__(self):
        super(FWSEGSNR, self).__init__(name='FWSEGSNR')
        # NOTE(review): windowed_scoring requires two signals, yet intrusive
        # is set False — confirm the intended meaning of this flag.
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        if len(audios) != 2:
            raise ValueError('FWSEGSNR needs a reference and a test signals.')
        # NOTE: argument order is swapped here — fwsegsnr(audios[1],
        # audios[0], ...); presumably audios[1] is the reference. Confirm.
        return fwsegsnr(audios[1], audios[0], score_rate)
|
| 14 |
+
|
| 15 |
+
def fwsegsnr(x, y, fs, frame_sz = 0.025, shift_sz= 0.01, win='hann', numband=23):
    """Frequency-weighted segmental SNR between a test and a reference signal.

    Args:
        x: test (degraded/estimated) signal, 1-D array.
        y: reference signal, 1-D array; its mel spectrum supplies the
           per-band weights.
        fs: sample rate in Hz.
        frame_sz: analysis frame length in seconds.
        shift_sz: frame shift in seconds.
        win: STFT window name forwarded to librosa.
        numband: number of mel bands for the weighting.

    Returns:
        Mean per-frame frequency-weighted SNR in dB, with each frame
        clipped to [-10, 35] as in the classic fwSNRseg definition.

    Raises:
        AssertionError: if the two signals differ in length.
    """
    epsilon = np.finfo(np.float32).eps
    frame = int(np.fix(frame_sz * fs))
    shift = int(np.fix(shift_sz * fs))
    # Next power of two >= frame length for the FFT size.
    fftpt = int(2**np.ceil(np.log2(np.abs(frame))))

    # Normalize both signals to unit energy so the measure is level-invariant.
    x = x / np.sqrt(np.sum(np.power(x, 2)))
    y = y / np.sqrt(np.sum(np.power(y, 2)))

    # Fixed: assert message is now a plain string (previously `print(...)`,
    # which would have produced an AssertionError with message None).
    assert len(x) == len(y), 'Wav length are not matched!'

    X_stft = np.abs(librosa.stft(x, n_fft=fftpt, hop_length=shift, win_length=frame, window=win, center=False))
    Y_stft = np.abs(librosa.stft(y, n_fft=fftpt, hop_length=shift, win_length=frame, window=win, center=False))

    X_mel = librosa.feature.melspectrogram(S=X_stft, sr=fs, n_mels=numband, fmin=0, fmax=fs/2)
    Y_mel = librosa.feature.melspectrogram(S=Y_stft, sr=fs, n_mels=numband, fmin=0, fmax=fs/2)

    # Calculate SNR.
    # Per-band weights: compressed reference magnitudes (bands with more
    # reference energy get a larger weight).
    W = np.power(Y_mel, 0.2)

    # Band-wise error; exact zeros are replaced so the ratio stays finite.
    E = X_mel - Y_mel
    E[E == 0.0] = epsilon
    E_power = np.power(E, 2)
    # Reuse E_power (previously recomputed as np.power(E, 2) a second time).
    Y_div_E = np.divide(np.power(Y_mel, 2), E_power)
    Y_div_E[Y_div_E == 0] = epsilon
    # Weighted mean of per-band log ratios, per frame (axis 1 = frames).
    ds = 10 * np.divide(np.sum(np.multiply(W, np.log10(Y_div_E)), 1), np.sum(W, 1))
    # Clip each frame to the conventional fwSNRseg range.
    ds[ds > 35] = 35
    ds[ds < -10] = -10
    d = np.mean(ds)
    return d
|
| 49 |
+
|
scores/helper.py
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Modifications in Metrics
|
| 3 |
+
|
| 4 |
+
# Original copyright:
|
| 5 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 6 |
+
# Demucs (https://github.com/facebookresearch/denoiser) / author: adefossez
|
| 7 |
+
"""
|
| 8 |
+
import numpy as np
|
| 9 |
+
from scipy.linalg import toeplitz
|
| 10 |
+
|
| 11 |
+
# ----------------------------- HELPERS ------------------------------------ #
|
| 12 |
+
def trim_mos(val):
    """Clamp a raw MOS value to the valid [1, 5] rating range."""
    if val < 1:
        return 1
    if val > 5:
        return 5
    return val
|
| 14 |
+
|
| 15 |
+
def lpcoeff(speech_frame, model_order):
    """Levinson-Durbin LPC analysis of one (windowed) speech frame.

    Returns a tuple (acorr, refcoeff, lpparams):
        acorr    - autocorrelation lags R[0..model_order] (float32)
        refcoeff - reflection (PARCOR) coefficients (float32)
        lpparams - LP filter parameters [1, -a_1, ..., -a_p] (float32)
    """
    # (1) Compute Autocor lags
    winlength = speech_frame.shape[0]
    R = []
    for k in range(model_order + 1):
        first = speech_frame[:(winlength - k)]
        second = speech_frame[k:winlength]
        R.append(np.sum(first * second))

    # (2) Lev-Durbin recursion: grow the order-i predictor from order i-1.
    a = np.ones((model_order,))
    E = np.zeros((model_order + 1,))
    rcoeff = np.zeros((model_order,))
    E[0] = R[0]  # zeroth-order prediction error is the frame energy
    for i in range(model_order):
        if i == 0:
            sum_term = 0
        else:
            a_past = a[:i]  # NOTE: view into `a`, not a copy
            sum_term = np.sum(a_past * np.array(R[i:0:-1]))
        rcoeff[i] = (R[i+1] - sum_term)/E[i]
        a[i] = rcoeff[i]
        if i > 0:
            # The RHS is fully evaluated into a temporary before the
            # assignment, so reading through the `a_past` view is safe here.
            a[:i] = a_past[:i] - rcoeff[i] * a_past[::-1]
        E[i+1] = (1-rcoeff[i]*rcoeff[i])*E[i]  # error-energy update
    acorr = np.array(R, dtype=np.float32)
    refcoeff = np.array(rcoeff, dtype=np.float32)
    a = a * -1  # sign convention: A(z) = 1 - sum(a_k z^-k)
    lpparams = np.array([1] + list(a), dtype=np.float32)
    # The re-casts below are redundant (already float32) but kept so the
    # code stays byte-identical in behavior to the original.
    acorr = np.array(acorr, dtype=np.float32)
    refcoeff = np.array(refcoeff, dtype=np.float32)
    lpparams = np.array(lpparams, dtype=np.float32)

    return acorr, refcoeff, lpparams
|
| 49 |
+
# -------------------------------------------------------------------------- #
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def SSNR(ref_wav, deg_wav, srate=16000, eps=1e-10):
    """ Segmental Signal-to-Noise Ratio Objective Speech Quality Measure
        This function implements the segmental signal-to-noise ratio
        as defined in [1, p. 45] (see Equation 2.12).

        Returns (overall_snr, segmental_snr): a global SNR in dB and a
        list of per-frame SNR values clipped to [-10, 35] dB.
    """
    # NOTE(review): these names alias the input arrays, so the in-place
    # DC removal / rescaling below mutates the caller's buffers (for numpy
    # inputs) -- confirm callers do not reuse the originals afterwards.
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]

    # scale both to have same dynamic range. Remove DC too.
    clean_speech -= clean_speech.mean()
    processed_speech -= processed_speech.mean()
    processed_speech *= (np.max(np.abs(clean_speech)) / np.max(np.abs(processed_speech)))

    # Signal-to-Noise Ratio
    # (ref_wav/deg_wav here are the already DC-removed, rescaled signals,
    # because of the in-place operations above.)
    dif = ref_wav - deg_wav
    overall_snr = 10 * np.log10(np.sum(ref_wav ** 2) / (np.sum(dif ** 2) +
                                                        10e-20))
    # global variables
    winlength = int(np.round(30 * srate / 1000)) # 30 msecs
    skiprate = winlength // 4  # 75% frame overlap
    MIN_SNR = -10  # per-frame SNR floor in dB
    MAX_SNR = 35   # per-frame SNR ceiling in dB

    # For each frame, calculate SSNR
    num_frames = int(clean_length / skiprate - (winlength/skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    segmental_snr = []

    for frame_count in range(int(num_frames)):
        # (1) get the frames for the test and ref speech.
        # Apply Hanning Window
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Compute Segmental SNR
        signal_energy = np.sum(clean_frame ** 2)
        noise_energy = np.sum((clean_frame - processed_frame) ** 2)
        # eps guards both the division and the log argument.
        segmental_snr.append(10 * np.log10(signal_energy / (noise_energy + eps)+ eps))
        segmental_snr[-1] = max(segmental_snr[-1], MIN_SNR)
        segmental_snr[-1] = min(segmental_snr[-1], MAX_SNR)
        start += int(skiprate)
    return overall_snr, segmental_snr
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def wss(ref_wav, deg_wav, srate):
    """Weighted Spectral Slope (WSS) distortion measure (Klatt, 1982).

    Compares per-frame spectral slopes of critical-band energies of the
    clean (ref_wav) and processed (deg_wav) signals.

    Returns:
        List with one WSS distortion value per analysis frame.
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]

    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.) # 240 wlen in samples
    skiprate = np.floor(winlength / 4)
    max_freq = srate / 2
    num_crit = 25 # num of critical bands

    USE_FFT_SPECTRUM = 1  # kept for parity with the reference implementation
    n_fft = int(2 ** np.ceil(np.log(2*winlength)/np.log(2)))  # next pow2 >= 2*winlength
    n_fftby2 = int(n_fft / 2)
    Kmax = 20     # absolute-peak weighting constant
    Klocmax = 1   # local-peak weighting constant

    # Critical band filter definitions (Center frequency and BW in Hz)
    cent_freq = [50., 120, 190, 260, 330, 400, 470, 540, 617.372,
                 703.378, 798.717, 904.128, 1020.38, 1148.30,
                 1288.72, 1442.54, 1610.70, 1794.16, 1993.93,
                 2211.08, 2446.71, 2701.97, 2978.04, 3276.17,
                 3597.63]
    bandwidth = [70., 70, 70, 70, 70, 70, 70, 77.3724, 86.0056,
                 95.3398, 105.411, 116.256, 127.914, 140.423,
                 153.823, 168.154, 183.457, 199.776, 217.153,
                 235.631, 255.255, 276.072, 298.126, 321.465,
                 346.136]

    bw_min = bandwidth[0] # min critical bandwidth

    # set up critical band filters. Note here that Gaussianly shaped filters
    # are used. Also, the sum of the filter weights are equivalent for each
    # critical band filter. Filter less than -30 dB and set to zero.
    min_factor = np.exp(-30. / (2 * 2.303)) # -30 dB point of filter

    crit_filter = np.zeros((num_crit, n_fftby2))
    all_f0 = []
    for i in range(num_crit):
        # Convert the band center/width from Hz to FFT-bin units.
        f0 = (cent_freq[i] / max_freq) * (n_fftby2)
        all_f0.append(np.floor(f0))
        bw = (bandwidth[i] / max_freq) * (n_fftby2)
        norm_factor = np.log(bw_min) - np.log(bandwidth[i])
        j = list(range(n_fftby2))
        crit_filter[i, :] = np.exp(-11 * (((j - np.floor(f0)) / bw) ** 2) + \
                                   norm_factor)
        crit_filter[i, :] = crit_filter[i, :] * (crit_filter[i, :] > \
                                                 min_factor)

    # For each frame of input speech, compute Weighted Spectral Slope Measure
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0 # starting sample
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    distortion = []

    for frame_count in range(num_frames):
        # (1) Get the Frames for the test and reference speeech.
        # Multiply by Hanning window.
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Compuet Power Spectrum of clean and processed
        clean_spec = (np.abs(np.fft.fft(clean_frame, n_fft)) ** 2)
        processed_spec = (np.abs(np.fft.fft(processed_frame, n_fft)) ** 2)
        clean_energy = [None] * num_crit
        processed_energy = [None] * num_crit

        # (3) Compute Filterbank output energies (in dB)
        for i in range(num_crit):
            clean_energy[i] = np.sum(clean_spec[:n_fftby2] * \
                                     crit_filter[i, :])
            processed_energy[i] = np.sum(processed_spec[:n_fftby2] * \
                                         crit_filter[i, :])
        # Floor each band energy at 1e-10 (elementwise max against eps)
        # before converting to dB, so the log never sees zero.
        clean_energy = np.array(clean_energy).reshape(-1, 1)
        eps = np.ones((clean_energy.shape[0], 1)) * 1e-10
        clean_energy = np.concatenate((clean_energy, eps), axis=1)
        clean_energy = 10 * np.log10(np.max(clean_energy, axis=1))
        processed_energy = np.array(processed_energy).reshape(-1, 1)
        processed_energy = np.concatenate((processed_energy, eps), axis=1)
        processed_energy = 10 * np.log10(np.max(processed_energy, axis=1))

        # (4) Compute Spectral Shape (dB[i+1] - dB[i])
        clean_slope = clean_energy[1:num_crit] - clean_energy[:num_crit-1]
        processed_slope = processed_energy[1:num_crit] - \
                processed_energy[:num_crit-1]

        # (5) Find the nearest peak locations in the spectra to each
        # critical band.  If the slope is negative, we search
        # to the left.  If positive, we search to the right.
        clean_loc_peak = []
        processed_loc_peak = []
        for i in range(num_crit - 1):
            if clean_slope[i] > 0:
                # search to the right
                n = i
                while n < num_crit - 1 and clean_slope[n] > 0:
                    n += 1
                clean_loc_peak.append(clean_energy[n - 1])
            else:
                # search to the left
                n = i
                while n >= 0 and clean_slope[n] <= 0:
                    n -= 1
                clean_loc_peak.append(clean_energy[n + 1])
            # find the peaks in the processed speech signal
            if processed_slope[i] > 0:
                n = i
                while n < num_crit - 1 and processed_slope[n] > 0:
                    n += 1
                processed_loc_peak.append(processed_energy[n - 1])
            else:
                n = i
                while n >= 0 and processed_slope[n] <= 0:
                    n -= 1
                processed_loc_peak.append(processed_energy[n + 1])

        # (6) Compuet the WSS Measure for this frame. This includes
        # determination of the weighting functino
        dBMax_clean = max(clean_energy)
        dBMax_processed = max(processed_energy)

        # The weights are calculated by averaging individual
        # weighting factors from the clean and processed frame.
        # These weights W_clean and W_processed should range
        # from 0 to 1 and place more emphasis on spectral
        # peaks and less emphasis on slope differences in spectral
        # valleys.  This procedure is described on page 1280 of
        # Klatt's 1982 ICASSP paper.
        clean_loc_peak = np.array(clean_loc_peak)
        processed_loc_peak = np.array(processed_loc_peak)
        Wmax_clean = Kmax / (Kmax + dBMax_clean - clean_energy[:num_crit-1])
        Wlocmax_clean = Klocmax / (Klocmax + clean_loc_peak - \
                                   clean_energy[:num_crit-1])
        W_clean = Wmax_clean * Wlocmax_clean
        Wmax_processed = Kmax / (Kmax + dBMax_processed - \
                                 processed_energy[:num_crit-1])
        Wlocmax_processed = Klocmax / (Klocmax + processed_loc_peak - \
                                       processed_energy[:num_crit-1])
        W_processed = Wmax_processed * Wlocmax_processed
        W = (W_clean + W_processed) / 2
        distortion.append(np.sum(W * (clean_slope[:num_crit - 1] - \
                                      processed_slope[:num_crit - 1]) ** 2))

        # this normalization is not part of Klatt's paper, but helps
        # to normalize the meaasure. Here we scale the measure by the sum of the
        # weights
        distortion[frame_count] = distortion[frame_count] / np.sum(W)
        start += int(skiprate)
    return distortion
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def llr(ref_wav, deg_wav, srate):
    """Log-Likelihood Ratio (LLR) objective speech-quality measure.

    Computes a per-frame LLR between LPC models fitted to the reference
    (ref_wav) and degraded (deg_wav) signals.

    Returns:
        numpy array of per-frame LLR values with NaNs mapped to 0.
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]
    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.) # 240 wlen in samples
    skiprate = np.floor(winlength / 4)
    if srate < 10000:
        # LPC analysis order
        P = 10
    else:
        P = 16

    # For each frame of input speech, calculate the Log Likelihood Ratio
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    distortion = []

    for frame_count in range(num_frames):
        # (1) Get the Frames for the test and reference speeech.
        # Multiply by Hanning window.
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Get the autocorrelation logs and LPC params used
        # to compute the LLR measure
        R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
        R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
        A_clean = A_clean[None, :]
        A_processed = A_processed[None, :]

        # (3) Compute the LLR measure: quadratic forms of the LP
        # coefficients against the reference autocorrelation matrix.
        numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
        denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)

        # Diagnostic only: a non-positive ratio would make the log below
        # NaN/undefined, which np.nan_to_num later maps to 0.
        if (numerator/denominator) <= 0:
            print(f'Numerator: {numerator}')
            print(f'Denominator: {denominator}')

        log_ = np.log(numerator / denominator)
        distortion.append(np.squeeze(log_))
        start += int(skiprate)
    return np.nan_to_num(np.array(distortion))
|
| 307 |
+
# -------------------------------------------------------------------------- #
|
scores/helper_bk.py
ADDED
|
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Modifications in Metrics
|
| 3 |
+
|
| 4 |
+
# Original copyright:
|
| 5 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 6 |
+
# Demucs (https://github.com/facebookresearch/denoiser) / author: adefossez
|
| 7 |
+
"""
|
| 8 |
+
import numpy as np
|
| 9 |
+
from scipy.linalg import toeplitz
|
| 10 |
+
|
| 11 |
+
# ----------------------------- HELPERS ------------------------------------ #
|
| 12 |
+
def trim_mos(val):
    """Clamp a raw MOS value to the valid [1, 5] rating range."""
    if val < 1:
        return 1
    if val > 5:
        return 5
    return val
|
| 14 |
+
|
| 15 |
+
def lpcoeff(speech_frame, model_order):
    """Levinson-Durbin LPC analysis of one (windowed) speech frame.

    Returns a tuple (acorr, refcoeff, lpparams):
        acorr    - autocorrelation lags R[0..model_order] (float32)
        refcoeff - reflection (PARCOR) coefficients (float32)
        lpparams - LP filter parameters [1, -a_1, ..., -a_p] (float32)
    """
    # (1) Compute Autocor lags
    winlength = speech_frame.shape[0]
    R = []
    for k in range(model_order + 1):
        first = speech_frame[:(winlength - k)]
        second = speech_frame[k:winlength]
        R.append(np.sum(first * second))

    # (2) Lev-Durbin recursion: grow the order-i predictor from order i-1.
    a = np.ones((model_order,))
    E = np.zeros((model_order + 1,))
    rcoeff = np.zeros((model_order,))
    E[0] = R[0]  # zeroth-order prediction error is the frame energy
    for i in range(model_order):
        if i == 0:
            sum_term = 0
        else:
            a_past = a[:i]  # NOTE: view into `a`, not a copy
            sum_term = np.sum(a_past * np.array(R[i:0:-1]))
        rcoeff[i] = (R[i+1] - sum_term)/E[i]
        a[i] = rcoeff[i]
        if i > 0:
            # The RHS is fully evaluated into a temporary before the
            # assignment, so reading through the `a_past` view is safe here.
            a[:i] = a_past[:i] - rcoeff[i] * a_past[::-1]
        E[i+1] = (1-rcoeff[i]*rcoeff[i])*E[i]  # error-energy update
    acorr = np.array(R, dtype=np.float32)
    refcoeff = np.array(rcoeff, dtype=np.float32)
    a = a * -1  # sign convention: A(z) = 1 - sum(a_k z^-k)
    lpparams = np.array([1] + list(a), dtype=np.float32)
    # The re-casts below are redundant (already float32) but kept so the
    # code stays byte-identical in behavior to the original.
    acorr = np.array(acorr, dtype=np.float32)
    refcoeff = np.array(refcoeff, dtype=np.float32)
    lpparams = np.array(lpparams, dtype=np.float32)

    return acorr, refcoeff, lpparams
|
| 49 |
+
# -------------------------------------------------------------------------- #
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def SSNR(ref_wav, deg_wav, srate=16000, eps=1e-10):
    """ Segmental Signal-to-Noise Ratio Objective Speech Quality Measure
        This function implements the segmental signal-to-noise ratio
        as defined in [1, p. 45] (see Equation 2.12).

        Returns (overall_snr, segmental_snr): a global SNR in dB and a
        list of per-frame SNR values clipped to [-10, 35] dB.
    """
    # NOTE(review): these names alias the input arrays, so the in-place
    # DC removal / rescaling below mutates the caller's buffers (for numpy
    # inputs) -- confirm callers do not reuse the originals afterwards.
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]

    # scale both to have same dynamic range. Remove DC too.
    clean_speech -= clean_speech.mean()
    processed_speech -= processed_speech.mean()
    processed_speech *= (np.max(np.abs(clean_speech)) / np.max(np.abs(processed_speech)))

    # Signal-to-Noise Ratio
    # (ref_wav/deg_wav here are the already DC-removed, rescaled signals,
    # because of the in-place operations above.)
    dif = ref_wav - deg_wav
    overall_snr = 10 * np.log10(np.sum(ref_wav ** 2) / (np.sum(dif ** 2) +
                                                        10e-20))
    # global variables
    winlength = int(np.round(30 * srate / 1000)) # 30 msecs
    skiprate = winlength // 4  # 75% frame overlap
    MIN_SNR = -10  # per-frame SNR floor in dB
    MAX_SNR = 35   # per-frame SNR ceiling in dB

    # For each frame, calculate SSNR
    num_frames = int(clean_length / skiprate - (winlength/skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    segmental_snr = []

    for frame_count in range(int(num_frames)):
        # (1) get the frames for the test and ref speech.
        # Apply Hanning Window
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Compute Segmental SNR
        signal_energy = np.sum(clean_frame ** 2)
        noise_energy = np.sum((clean_frame - processed_frame) ** 2)
        # eps guards both the division and the log argument.
        segmental_snr.append(10 * np.log10(signal_energy / (noise_energy + eps)+ eps))
        segmental_snr[-1] = max(segmental_snr[-1], MIN_SNR)
        segmental_snr[-1] = min(segmental_snr[-1], MAX_SNR)
        start += int(skiprate)
    return overall_snr, segmental_snr
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def wss(ref_wav, deg_wav, srate):
    """Weighted Spectral Slope (WSS) distortion measure (Klatt, 1982).

    Compares per-frame spectral slopes of critical-band energies of the
    clean (ref_wav) and processed (deg_wav) signals.

    Returns:
        List with one WSS distortion value per analysis frame.
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]

    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.) # 240 wlen in samples
    skiprate = np.floor(winlength / 4)
    max_freq = srate / 2
    num_crit = 25 # num of critical bands

    USE_FFT_SPECTRUM = 1  # kept for parity with the reference implementation
    n_fft = int(2 ** np.ceil(np.log(2*winlength)/np.log(2)))  # next pow2 >= 2*winlength
    n_fftby2 = int(n_fft / 2)
    Kmax = 20     # absolute-peak weighting constant
    Klocmax = 1   # local-peak weighting constant

    # Critical band filter definitions (Center frequency and BW in Hz)
    cent_freq = [50., 120, 190, 260, 330, 400, 470, 540, 617.372,
                 703.378, 798.717, 904.128, 1020.38, 1148.30,
                 1288.72, 1442.54, 1610.70, 1794.16, 1993.93,
                 2211.08, 2446.71, 2701.97, 2978.04, 3276.17,
                 3597.63]
    bandwidth = [70., 70, 70, 70, 70, 70, 70, 77.3724, 86.0056,
                 95.3398, 105.411, 116.256, 127.914, 140.423,
                 153.823, 168.154, 183.457, 199.776, 217.153,
                 235.631, 255.255, 276.072, 298.126, 321.465,
                 346.136]

    bw_min = bandwidth[0] # min critical bandwidth

    # set up critical band filters. Note here that Gaussianly shaped filters
    # are used. Also, the sum of the filter weights are equivalent for each
    # critical band filter. Filter less than -30 dB and set to zero.
    min_factor = np.exp(-30. / (2 * 2.303)) # -30 dB point of filter

    crit_filter = np.zeros((num_crit, n_fftby2))
    all_f0 = []
    for i in range(num_crit):
        # Convert the band center/width from Hz to FFT-bin units.
        f0 = (cent_freq[i] / max_freq) * (n_fftby2)
        all_f0.append(np.floor(f0))
        bw = (bandwidth[i] / max_freq) * (n_fftby2)
        norm_factor = np.log(bw_min) - np.log(bandwidth[i])
        j = list(range(n_fftby2))
        crit_filter[i, :] = np.exp(-11 * (((j - np.floor(f0)) / bw) ** 2) + \
                                   norm_factor)
        crit_filter[i, :] = crit_filter[i, :] * (crit_filter[i, :] > \
                                                 min_factor)

    # For each frame of input speech, compute Weighted Spectral Slope Measure
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0 # starting sample
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    distortion = []

    for frame_count in range(num_frames):
        # (1) Get the Frames for the test and reference speeech.
        # Multiply by Hanning window.
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Compuet Power Spectrum of clean and processed
        clean_spec = (np.abs(np.fft.fft(clean_frame, n_fft)) ** 2)
        processed_spec = (np.abs(np.fft.fft(processed_frame, n_fft)) ** 2)
        clean_energy = [None] * num_crit
        processed_energy = [None] * num_crit

        # (3) Compute Filterbank output energies (in dB)
        for i in range(num_crit):
            clean_energy[i] = np.sum(clean_spec[:n_fftby2] * \
                                     crit_filter[i, :])
            processed_energy[i] = np.sum(processed_spec[:n_fftby2] * \
                                         crit_filter[i, :])
        # Floor each band energy at 1e-10 (elementwise max against eps)
        # before converting to dB, so the log never sees zero.
        clean_energy = np.array(clean_energy).reshape(-1, 1)
        eps = np.ones((clean_energy.shape[0], 1)) * 1e-10
        clean_energy = np.concatenate((clean_energy, eps), axis=1)
        clean_energy = 10 * np.log10(np.max(clean_energy, axis=1))
        processed_energy = np.array(processed_energy).reshape(-1, 1)
        processed_energy = np.concatenate((processed_energy, eps), axis=1)
        processed_energy = 10 * np.log10(np.max(processed_energy, axis=1))

        # (4) Compute Spectral Shape (dB[i+1] - dB[i])
        clean_slope = clean_energy[1:num_crit] - clean_energy[:num_crit-1]
        processed_slope = processed_energy[1:num_crit] - \
                processed_energy[:num_crit-1]

        # (5) Find the nearest peak locations in the spectra to each
        # critical band.  If the slope is negative, we search
        # to the left.  If positive, we search to the right.
        clean_loc_peak = []
        processed_loc_peak = []
        for i in range(num_crit - 1):
            if clean_slope[i] > 0:
                # search to the right
                n = i
                while n < num_crit - 1 and clean_slope[n] > 0:
                    n += 1
                clean_loc_peak.append(clean_energy[n - 1])
            else:
                # search to the left
                n = i
                while n >= 0 and clean_slope[n] <= 0:
                    n -= 1
                clean_loc_peak.append(clean_energy[n + 1])
            # find the peaks in the processed speech signal
            if processed_slope[i] > 0:
                n = i
                while n < num_crit - 1 and processed_slope[n] > 0:
                    n += 1
                processed_loc_peak.append(processed_energy[n - 1])
            else:
                n = i
                while n >= 0 and processed_slope[n] <= 0:
                    n -= 1
                processed_loc_peak.append(processed_energy[n + 1])

        # (6) Compuet the WSS Measure for this frame. This includes
        # determination of the weighting functino
        dBMax_clean = max(clean_energy)
        dBMax_processed = max(processed_energy)

        # The weights are calculated by averaging individual
        # weighting factors from the clean and processed frame.
        # These weights W_clean and W_processed should range
        # from 0 to 1 and place more emphasis on spectral
        # peaks and less emphasis on slope differences in spectral
        # valleys.  This procedure is described on page 1280 of
        # Klatt's 1982 ICASSP paper.
        clean_loc_peak = np.array(clean_loc_peak)
        processed_loc_peak = np.array(processed_loc_peak)
        Wmax_clean = Kmax / (Kmax + dBMax_clean - clean_energy[:num_crit-1])
        Wlocmax_clean = Klocmax / (Klocmax + clean_loc_peak - \
                                   clean_energy[:num_crit-1])
        W_clean = Wmax_clean * Wlocmax_clean
        Wmax_processed = Kmax / (Kmax + dBMax_processed - \
                                 processed_energy[:num_crit-1])
        Wlocmax_processed = Klocmax / (Klocmax + processed_loc_peak - \
                                       processed_energy[:num_crit-1])
        W_processed = Wmax_processed * Wlocmax_processed
        W = (W_clean + W_processed) / 2
        distortion.append(np.sum(W * (clean_slope[:num_crit - 1] - \
                                      processed_slope[:num_crit - 1]) ** 2))

        # this normalization is not part of Klatt's paper, but helps
        # to normalize the meaasure. Here we scale the measure by the sum of the
        # weights
        distortion[frame_count] = distortion[frame_count] / np.sum(W)
        start += int(skiprate)
    return distortion
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def llr(ref_wav, deg_wav, srate):
    """Frame-wise Log Likelihood Ratio between a reference and a degraded signal.

    Both signals must have the same length. Returns a 1-D array with one LLR
    value per 30 ms analysis frame (75% overlap); NaNs arising from
    non-positive ratios are mapped to 0 by ``np.nan_to_num``.
    """
    n_samples = ref_wav.shape[0]
    assert n_samples == deg_wav.shape[0], n_samples

    winlength = round(30 * srate / 1000.)   # 30 ms window (240 samples at 8 kHz)
    skiprate = np.floor(winlength / 4)      # hop is a quarter window
    P = 10 if srate < 10000 else 16         # LPC analysis order

    num_frames = int(n_samples / skiprate - (winlength / skiprate))
    # Hanning window (same formulation as the original implementation).
    t = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * t))

    distortion = []
    start = 0
    for _ in range(num_frames):
        # (1) Windowed frames of the reference and the degraded speech.
        clean_frame = ref_wav[start:start + winlength] * window
        processed_frame = deg_wav[start:start + winlength] * window

        # (2) Autocorrelation sequences and LPC coefficients for both frames.
        R_clean, _, A_clean = lpcoeff(clean_frame, P)
        R_processed, _, A_processed = lpcoeff(processed_frame, P)
        A_clean = A_clean[None, :]
        A_processed = A_processed[None, :]

        # (3) LLR = log(a_p R_c a_p^T / a_c R_c a_c^T)
        numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
        denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)

        # A non-positive ratio makes the log undefined; report it (the
        # resulting NaN is zeroed below).
        if (numerator / denominator) <= 0:
            print(f'Numerator: {numerator}')
            print(f'Denominator: {denominator}')

        distortion.append(np.squeeze(np.log(numerator / denominator)))
        start += int(skiprate)
    return np.nan_to_num(np.array(distortion))
|
| 307 |
+
# -------------------------------------------------------------------------- #
|
| 308 |
+
|
| 309 |
+
#!/usr/bin/env python3
|
| 310 |
+
|
| 311 |
+
# Copyright 2020 Wen-Chin Huang and Tomoki Hayashi
|
| 312 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
| 313 |
+
# ported from https://github.com/espnet/espnet/blob/master/utils/mcd_calculate.py
|
| 314 |
+
|
| 315 |
+
"""Evaluate MCD between generated and groundtruth audios with SPTK-based mcep."""
|
| 316 |
+
|
| 317 |
+
from typing import Tuple
|
| 318 |
+
|
| 319 |
+
import numpy as np
|
| 320 |
+
import pysptk
|
| 321 |
+
from fastdtw import fastdtw
|
| 322 |
+
from scipy import spatial
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def sptk_extract(
    x: np.ndarray,
    fs: int,
    n_fft: int = 512,
    n_shift: int = 256,
    mcep_dim: int = 25,
    mcep_alpha: float = 0.41,
    is_padding: bool = False,
) -> np.ndarray:
    """Extract SPTK-based mel-cepstrum.

    Args:
        x (ndarray): 1D waveform array.
        fs (int): Sampling rate.
        n_fft (int): FFT length in point (default=512).
        n_shift (int): Shift length in point (default=256).
        mcep_dim (int): Dimension of mel-cepstrum (default=25).  If None,
            a sampling-rate dependent default is used instead.
        mcep_alpha (float): All pass filter coefficient (default=0.41).  If
            None, a sampling-rate dependent default is used instead.
        is_padding (bool): Whether to pad the end of signal (default=False).

    Returns:
        ndarray: Mel-cepstrum of shape (n_frame, mcep_dim + 1) —
        pysptk.mcep returns order+1 coefficients per frame.

    """
    # perform padding so the tail samples still form a full frame
    if is_padding:
        n_pad = n_fft - (len(x) - n_fft) % n_shift
        x = np.pad(x, (0, n_pad), "reflect")

    # get number of whole analysis frames
    n_frame = (len(x) - n_fft) // n_shift + 1

    # get window function
    win = pysptk.sptk.hamming(n_fft)

    # check mcep and alpha; fall back to per-sampling-rate recommendations
    if mcep_dim is None or mcep_alpha is None:
        mcep_dim, mcep_alpha = _get_best_mcep_params(fs)

    # calculate mel-cepstrum frame by frame (etype=1: eps added to periodogram)
    mcep = [
        pysptk.mcep(
            x[n_shift * i : n_shift * i + n_fft] * win,
            mcep_dim,
            mcep_alpha,
            eps=1e-6,
            etype=1,
        )
        for i in range(n_frame)
    ]

    return np.stack(mcep)
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
def _get_best_mcep_params(fs: int) -> Tuple[int, float]:
|
| 380 |
+
# https://sp-nitech.github.io/sptk/latest/main/mgcep.html#_CPPv4N4sptk19MelCepstralAnalysisE
|
| 381 |
+
if fs == 8000:
|
| 382 |
+
return 13, 0.31
|
| 383 |
+
elif fs == 16000:
|
| 384 |
+
return 23, 0.42
|
| 385 |
+
elif fs == 22050:
|
| 386 |
+
return 34, 0.45
|
| 387 |
+
elif fs == 24000:
|
| 388 |
+
return 34, 0.46
|
| 389 |
+
elif fs == 32000:
|
| 390 |
+
return 36, 0.50
|
| 391 |
+
elif fs == 44100:
|
| 392 |
+
return 39, 0.53
|
| 393 |
+
elif fs == 48000:
|
| 394 |
+
return 39, 0.55
|
| 395 |
+
else:
|
| 396 |
+
raise ValueError(f"Not found the setting for {fs}.")
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
def calculate_mcd(
    inf_audio,
    ref_audio,
    fs,
    n_fft=1024,
    n_shift=256,
    mcep_dim=None,
    mcep_alpha=None,
):
    """Calculate mel-cepstral distortion (MCD) between two waveforms.

    The mel-cepstra of the generated (*inf_audio*) and ground-truth
    (*ref_audio*) signals are time-aligned with DTW before the frame-wise
    distortion is averaged.
    """
    # extract mel-cepstra for the generated and the ground-truth signals
    gen_mcep, gt_mcep = (
        sptk_extract(
            x=wav,
            fs=fs,
            n_fft=n_fft,
            n_shift=n_shift,
            mcep_dim=mcep_dim,
            mcep_alpha=mcep_alpha,
        )
        for wav in (inf_audio, ref_audio)
    )

    # time-align both sequences with dynamic time warping
    _, path = fastdtw(gen_mcep, gt_mcep, dist=spatial.distance.euclidean)
    twf = np.array(path).T
    aligned_gen = gen_mcep[twf[0]]
    aligned_gt = gt_mcep[twf[1]]

    # MCD: per-frame squared difference, then the standard 10/ln(10)*sqrt(2) scaling
    diff2sum = np.sum((aligned_gen - aligned_gt) ** 2, 1)
    return np.mean(10.0 / np.log(10.0) * np.sqrt(2 * diff2sum), 0)
|
scores/llr.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from basis import ScoreBasis
|
| 2 |
+
import numpy as np
|
| 3 |
+
from scipy.linalg import toeplitz
|
| 4 |
+
from scores.helper import lpcoeff
|
| 5 |
+
|
| 6 |
+
class LLR(ScoreBasis):
    """Log Likelihood Ratio metric between a reference and a test signal."""

    def __init__(self):
        super(LLR, self).__init__(name='LLR')
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        """Score ``audios == [reference, test]`` sampled at *score_rate*."""
        if len(audios) == 2:
            return cal_LLR(audios[0], audios[1], score_rate)
        raise ValueError('LLR needs a reference and a test signals.')
|
| 15 |
+
|
| 16 |
+
def cal_LLR(ref_wav, deg_wav, srate):
    """Mean frame-wise Log Likelihood Ratio of *deg_wav* against *ref_wav*.

    Adapted from:
    https://github.com/wooseok-shin/MetricGAN-plus-pytorch/blob/main/metric_functions/metric_helper.py
    """
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]
    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.)   # 30 ms window (240 samples at 8 kHz)
    skiprate = np.floor(winlength / 4)      # quarter-window hop
    # LPC analysis order depends on the sampling rate.
    P = 10 if srate < 10000 else 16

    # Number of whole frames; Hanning window as in the original formulation.
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    hann = 0.5 * (1 - np.cos(2 * np.pi * np.linspace(1, winlength, winlength) / (winlength + 1)))

    distortion = []
    start = 0
    for _ in range(num_frames):
        # (1) Windowed frames of reference and degraded speech.
        clean_frame = ref_wav[start:start + winlength] * hann
        processed_frame = deg_wav[start:start + winlength] * hann

        # (2) Autocorrelations and LPC parameters for the LLR measure.
        R_clean, _, A_clean = lpcoeff(clean_frame, P)
        R_processed, _, A_processed = lpcoeff(processed_frame, P)
        A_clean = A_clean[None, :]
        A_processed = A_processed[None, :]

        # (3) LLR = log(a_p R_c a_p^T / a_c R_c a_c^T)
        numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
        denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)

        # Non-positive ratios make the log undefined; report them (the NaN is
        # zeroed by nan_to_num below).
        if (numerator / denominator) <= 0:
            print(f'Numerator: {numerator}')
            print(f'Denominator: {denominator}')

        distortion.append(np.squeeze(np.log(numerator / denominator)))
        start += int(skiprate)
    return np.mean(np.nan_to_num(np.array(distortion)))
|
| 66 |
+
|
scores/lsd.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from basis import ScoreBasis
|
| 2 |
+
import numpy as np
|
| 3 |
+
import librosa
|
| 4 |
+
|
| 5 |
+
EPS = 1e-12
|
| 6 |
+
|
| 7 |
+
class LSD(ScoreBasis):
    """Log-Spectral Distance between a reference and a test signal."""

    def __init__(self):
        super(LSD, self).__init__(name='LSD')
        self.intrusive = False
        self.mono = True

    def windowed_scoring(self, audios, score_rate):
        """Score ``audios == [reference, test]`` sampled at *score_rate*.

        Raises:
            ValueError: if not exactly two signals are given.
        """
        if len(audios) != 2:
            # Fixed: the message previously said 'NB_PESQ' (copy-paste from
            # another metric); this class computes LSD.
            raise ValueError('LSD needs a reference and a test signals.')
        est = wav_to_spectrogram(audios[1], score_rate)
        target = wav_to_spectrogram(audios[0], score_rate)
        return cal_LSD(est, target)
|
| 19 |
+
|
| 20 |
+
def wav_to_spectrogram(wav, rate):
    """Magnitude STFT of *wav*, returned as a (frames, bins) array.

    The hop is 10 ms and the FFT size scales with the sampling rate so that
    48 kHz audio uses n_fft=2048.
    """
    hop_length = int(rate / 100)            # 10 ms hop
    n_fft = int(2048 / (48000 / rate))      # FFT size proportional to rate
    magnitude = np.abs(librosa.stft(wav, hop_length=hop_length, n_fft=n_fft))
    # librosa.stft returns (bins, frames); transpose to (frames, bins).
    return np.transpose(magnitude, (1, 0))
|
| 26 |
+
|
| 27 |
+
def cal_LSD(est, target):
    """Log-spectral distance between magnitude spectrograms *est* and *target*.

    Both inputs are (frames, bins) arrays.  The squared log10 power ratio is
    averaged over bins, square-rooted per frame, and then averaged over frames.
    EPS guards against division by zero and log of zero.
    """
    squared_log_ratio = np.square(np.log10(target ** 2 / ((est + EPS) ** 2) + EPS))
    per_frame = np.sqrt(np.mean(squared_log_ratio, axis=1))
    return np.mean(per_frame, axis=0)
|
scores/mcd.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from basis import ScoreBasis
|
| 2 |
+
import librosa
|
| 3 |
+
import math
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pyworld
|
| 6 |
+
import pysptk
|
| 7 |
+
from fastdtw import fastdtw
|
| 8 |
+
from scipy.spatial.distance import euclidean
|
| 9 |
+
#from scores.helper import calculate_mcd
|
| 10 |
+
#from pymcd.mcd import Calculate_MCD
|
| 11 |
+
#refer to : https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py
|
| 12 |
+
class MCD(ScoreBasis):
    """Mel-Cepstral Distortion metric backed by :class:`Calculate_MCD`."""

    def __init__(self):
        super(MCD, self).__init__(name='MCD')
        self.intrusive = False
        # Calculate_MCD supports "plain", "dtw" and "dtw_sl"; plain is used here.
        self.mcd_toolbox = Calculate_MCD(MCD_mode="plain")

    def windowed_scoring(self, audios, score_rate):
        """Score a pair of signals at *score_rate*.

        NOTE(review): audios[1] is passed as the reference and audios[0] as
        the synthesized signal — confirm this ordering against the other
        metrics in this package.
        """
        if len(audios) == 2:
            return self.mcd_toolbox.calculate_mcd(audios[1], audios[0], score_rate)
        raise ValueError('MCD needs a reference and a test signals.')
|
| 23 |
+
|
| 24 |
+
# ================================================= #
|
| 25 |
+
# calculate the Mel-Cepstral Distortion (MCD) value #
|
| 26 |
+
# ================================================= #
|
| 27 |
+
#refer to : https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py
|
| 28 |
+
class Calculate_MCD(object):
    """Mel-Cepstral Distortion (MCD) calculator.

    Supported ``MCD_mode`` values:
      - "plain":  frame-by-frame comparison after zero-padding the shorter
                  waveform to the length of the longer one.
      - "dtw":    dynamic-time-warping alignment of the two MCEP sequences
                  before averaging the distortion.
      - "dtw_sl": like "dtw", additionally weighted by the length ratio of
                  the two MCEP sequences.

    Adapted from https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py
    """
    def __init__(self, MCD_mode):
        super(Calculate_MCD, self).__init__()
        self.MCD_mode = MCD_mode
        #self.SAMPLING_RATE = 22050
        # WORLD vocoder analysis frame period (milliseconds)
        self.FRAME_PERIOD = 5.0
        # MCD formula constant: 10 / ln(10) * sqrt(2)
        self.log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)  # 6.141851463713754

    def load_wav(self, wav_file, sample_rate):
        """
        Load a wav file with librosa.
        :param wav_file: path to wav file
        :param sample_rate: sampling rate
        :return: audio time series numpy array
        """
        wav, _ = librosa.load(wav_file, sr=sample_rate, mono=True)
        return wav

    # distance metric: scaled Euclidean distance between two MCEP frames
    def log_spec_dB_dist(self, x, y):
        # log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)
        diff = x - y
        return self.log_spec_dB_const * math.sqrt(np.inner(diff, diff))

    # calculate distance (metric)
    # def calculate_mcd_distance(self, x, y, distance, path):
    def calculate_mcd_distance(self, x, y, path):
        '''
        Sum the Euclidean distances between the MCEP frame pairs in *path*.
        :param path: list of (index_in_x, index_in_y) pairs between x and y
        :return: (number of pairs, summed per-pair Euclidean distance)
        '''
        pathx = list(map(lambda l: l[0], path))
        pathy = list(map(lambda l: l[1], path))
        x, y = x[pathx], y[pathy]
        frames_tot = x.shape[0]  # length of pairs

        z = x - y
        # per-pair Euclidean norm, summed over all pairs
        min_cost_tot = np.sqrt((z * z).sum(-1)).sum()

        return frames_tot, min_cost_tot

    # extract acoustic features
    # alpha = 0.65  # commonly used at 22050 Hz
    def wav2mcep_numpy(self, loaded_wav, score_rate=22050, alpha=0.65, fft_size=512):
        """Extract MCEP features from a waveform (order 13 → 14 coefficients per frame)."""
        # Use the WORLD vocoder to obtain the spectral envelope
        _, sp, _ = pyworld.wav2world(loaded_wav.astype(np.double), fs=score_rate,
                                     frame_period=self.FRAME_PERIOD, fft_size=fft_size)
        # Extract MCEP features (itype=3: input is a spectral envelope)
        mcep = pysptk.sptk.mcep(sp, order=13, alpha=alpha, maxiter=0,
                                etype=1, eps=1.0E-8, min_det=0.0, itype=3)

        return mcep

    # calculate the Mel-Cepstral Distortion (MCD) value
    #def average_mcd(self, ref_audio_file, syn_audio_file, cost_function, MCD_mode):
    def average_mcd(self, loaded_ref_wav, loaded_syn_wav, cost_function, MCD_mode, score_rate):
        """
        Calculate the average MCD between two already-loaded waveforms.
        :param loaded_ref_wav: reference waveform (1-D numpy array)
        :param loaded_syn_wav: synthesized waveform (1-D numpy array)
        :param cost_function: distance metric (NOTE: currently unused here)
        :param MCD_mode: one of "plain", "dtw", "dtw_sl"
        :param score_rate: sampling rate of both waveforms
        :returns: average MCD value
        """
        # load wav from given wav file
        #loaded_ref_wav = self.load_wav(ref_audio_file, sample_rate=self.SAMPLING_RATE)
        #loaded_syn_wav = self.load_wav(syn_audio_file, sample_rate=self.SAMPLING_RATE)

        if MCD_mode == "plain":
            # pad the shorter signal with zeros so both have equal length
            if len(loaded_ref_wav)<len(loaded_syn_wav):
                loaded_ref_wav = np.pad(loaded_ref_wav, (0, len(loaded_syn_wav)-len(loaded_ref_wav)))
            else:
                loaded_syn_wav = np.pad(loaded_syn_wav, (0, len(loaded_ref_wav)-len(loaded_syn_wav)))

        # extract MCEP features (vectors): 2D matrix (num x mcep_size)
        ref_mcep_vec = self.wav2mcep_numpy(loaded_ref_wav, score_rate)
        syn_mcep_vec = self.wav2mcep_numpy(loaded_syn_wav, score_rate)

        if MCD_mode == "plain":
            # print("Calculate plain MCD ...")
            # identity alignment: frame i pairs with frame i
            path = []
            # for i in range(num_temp):
            for i in range(len(ref_mcep_vec)):
                path.append((i, i))
        elif MCD_mode == "dtw":
            # print("Calculate MCD-dtw ...")
            # DTW alignment, excluding the 0th coefficient of each frame
            _, path = fastdtw(ref_mcep_vec[:, 1:], syn_mcep_vec[:, 1:], dist=euclidean)
        elif MCD_mode == "dtw_sl":
            # print("Calculate MCD-dtw-sl ...")
            # length-ratio weight (always >= 1)
            cof = len(ref_mcep_vec)/len(syn_mcep_vec) if len(ref_mcep_vec)>len(syn_mcep_vec) else len(syn_mcep_vec)/len(ref_mcep_vec)
            _, path = fastdtw(ref_mcep_vec[:, 1:], syn_mcep_vec[:, 1:], dist=euclidean)
        # NOTE(review): any other MCD_mode leaves `path` undefined and the
        # call below raises NameError.

        frames_tot, min_cost_tot = self.calculate_mcd_distance(ref_mcep_vec, syn_mcep_vec, path)

        if MCD_mode == "dtw_sl":
            mean_mcd = cof * self.log_spec_dB_const * min_cost_tot / frames_tot
        else:
            mean_mcd = self.log_spec_dB_const * min_cost_tot / frames_tot

        return mean_mcd

    # calculate mcd
    def calculate_mcd(self, reference_audio, synthesized_audio, score_rate):
        """Compute the MCD of *synthesized_audio* against *reference_audio* using self.MCD_mode."""
        # extract acoustic features
        mean_mcd = self.average_mcd(reference_audio, synthesized_audio, self.log_spec_dB_dist, self.MCD_mode, score_rate)

        return mean_mcd
|
scores/mosnet/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def load(window, hop=None):
    """Build a MOSNet instance, configuring TensorFlow GPU memory growth first.

    Args:
        window: analysis window length forwarded to MOSNet.
        hop: hop length forwarded to MOSNet (default: None).

    Returns:
        A MOSNet object.
    """
    import tensorflow as tf
    from .model import MOSNet
    tf.debugging.set_log_device_placement(False)
    # Enable memory growth so TF does not reserve all GPU memory up front.
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # Memory growth must be configured identically on every GPU.
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # Raised when the GPUs were already initialized before this call.
            print(e)

    return MOSNet(window, hop)
|
scores/mosnet/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (789 Bytes). View file
|
|
|
scores/mosnet/cnn_blstm.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:78b75e7d76ee6074ea7d57dcffa56d0c90be9d3d8dedc2217e25e259423cb756
|
| 3 |
+
size 14248464
|