diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c09814d9486ac50e9422bb4d0f4176ccb194fd03 --- /dev/null +++ b/__init__.py @@ -0,0 +1,169 @@ +class Metric: + def __init__(self, name, window, hop=None, verbose=False): + # the metric operates on some fixed rate only or only on mono ? + self.fixed_rate = None + self.mono = False + + # is the metric absolute or relative ? + self.absolute = False + + # length and hop of windows + self.window = window + if hop is None: + hop = window + self.hop = hop + self.name = name + self.verbose = verbose + + def test_window(self, audios, rate): + raise NotImplementedError + + def test(self, *test_files, array_rate=None): + """loading sound files and making sure they all have the same lengths + (zero-padding to the largest). Also works with numpy arrays. + Then, calling the `test_window` function that should be specialised + depending on the metric.""" + + # imports + import soundfile as sf + import resampy + from museval.metrics import Framing + import numpy as np + + audios = [] + maxlen = 0 + if isinstance(test_files, str): + test_files = [test_files] + if self.absolute and len(test_files) > 1: + if self.verbose: + print(' [%s] is absolute. Processing first file only' + % self.name) + test_files = [test_files[0],] + + for file in test_files: + # Loading sound file + if isinstance(file, str): + audio, rate = sf.read(file, always_2d=True) + else: + rate = array_rate + if rate is None: + raise ValueError('Sampling rate needs to be specified ' + 'when feeding numpy arrays.') + audio = file + # Standardize shapes + if len(audio.shape) == 1: + audio = audio[:, None] + if len(audio.shape) != 2: + raise ValueError('Please provide 1D or 2D array, received ' + '{}D array'.format(len(audio.shape))) + + if self.fixed_rate is not None and rate != self.fixed_rate: + if self.verbose: + print(' [%s] preferred is %dkHz rate. 
resampling' + % (self.name, self.fixed_rate)) + audio = resampy.resample(audio, rate, self.fixed_rate, axis=0) + rate = self.fixed_rate + if self.mono and audio.shape[1] > 1: + if self.verbose: + print(' [%s] only supports mono. Will use first channel' + % self.name) + audio = audio[..., 0, None] + if self.mono: + audio = audio[..., 0] + maxlen = max(maxlen, audio.shape[0]) + audios += [audio] + + for index, audio in enumerate(audios): + if audio.shape[0] != maxlen: + new = np.zeros((maxlen,) + audio.shape[1:]) + new[:audio.shape[0]] = audio + audios[index] = new + + if self.window is not None: + framer = Framing(self.window * rate, + self.hop * rate, maxlen) + nwin = framer.nwin + result = {} + for (t, win) in enumerate(framer): + result_t = self.test_window([audio[win] for audio in audios], + rate) + for metric in result_t.keys(): + if metric not in result.keys(): + result[metric] = np.empty(nwin) + result[metric][t] = result_t[metric] + else: + result = self.test_window(audios, rate) + return result + + +import absolute +import relative + + +class MetricsList: + def __init__(self): + self.metrics = [] + + def __add__(self, metric): + self.metrics += [metric] + return self + + def __str__(self): + return 'Metrics: ' + ' '.join([x.name for x in self.metrics]) + + def __call__(self, *files, rate=None): + result = {} + for metric in self.metrics: + result_metric = metric.test(*files, array_rate=rate) + for name in result_metric.keys(): + result[name] = result_metric[name] + return result + + +def load(metrics='', window=2, verbose=False): + """ Load the desired metrics inside a Metrics object that can then + be called to compute all the desired metrics. + + Parameters: + ---------- + metrics: str or list of str + the metrics matching any of these will be automatically loaded. this + match is relative to the structure of the speechmetrics package. 
+ For instance: + * 'absolute' will match all absolute metrics + * 'absolute.srmr' or 'srmr' will only match SRMR + * '' will match all + + window: float + the window length to use for testing the files. + + verbose: boolean + will display information during computations + + Returns: + -------- + + A MetricsList object, that can be run to get the desired metrics + """ + import pkgutil + import importlib + + result = MetricsList() + + found_modules = [] + iterator = pkgutil.walk_packages(__path__, __name__ + '.') + + if isinstance(metrics, str): + metrics = [metrics] + for module_info in iterator: + if any([metric in module_info.name for metric in metrics]): + module = importlib.import_module(module_info.name) + if module not in found_modules: + found_modules += [module], + if hasattr(module, 'load'): + load_function = getattr(module, 'load') + new_metric = load_function(window) + new_metric.verbose = verbose + result += new_metric + print('Loaded ', module_info.name) + return result diff --git a/__pycache__/basis.cpython-38.pyc b/__pycache__/basis.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea7bc745141515177444e76c03388aa4c850d553 Binary files /dev/null and b/__pycache__/basis.cpython-38.pyc differ diff --git a/__pycache__/metric_loader.cpython-38.pyc b/__pycache__/metric_loader.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d05fc1493e0cd66de4ef52326a9cee7fc4292331 Binary files /dev/null and b/__pycache__/metric_loader.cpython-38.pyc differ diff --git a/__pycache__/metrics.cpython-38.pyc b/__pycache__/metrics.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a7b0056b5bdaf4ebfd941a7370c59975720acfa Binary files /dev/null and b/__pycache__/metrics.cpython-38.pyc differ diff --git a/__pycache__/speechscore.cpython-38.pyc b/__pycache__/speechscore.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..ae61aff9bd36c3c8424db7db13f9c44291978a59 Binary files /dev/null and b/__pycache__/speechscore.cpython-38.pyc differ diff --git a/audios/clean/audio_1.wav b/audios/clean/audio_1.wav new file mode 100644 index 0000000000000000000000000000000000000000..acd41d940ef722dc3d7c5356acd0ab6a1c17aafa Binary files /dev/null and b/audios/clean/audio_1.wav differ diff --git a/audios/clean/audio_2.wav b/audios/clean/audio_2.wav new file mode 100644 index 0000000000000000000000000000000000000000..7772afa2ef5f6881d5b16cf0c301fc015b88c7b7 Binary files /dev/null and b/audios/clean/audio_2.wav differ diff --git a/audios/noisy/audio_1.wav b/audios/noisy/audio_1.wav new file mode 100644 index 0000000000000000000000000000000000000000..425d7c04381e56f0b78947743cfa35ca0d99ac17 Binary files /dev/null and b/audios/noisy/audio_1.wav differ diff --git a/audios/noisy/audio_2.wav b/audios/noisy/audio_2.wav new file mode 100644 index 0000000000000000000000000000000000000000..e7d44822410a3a64ddf0d6283787b232679190bf Binary files /dev/null and b/audios/noisy/audio_2.wav differ diff --git a/audios/ref.wav b/audios/ref.wav new file mode 100644 index 0000000000000000000000000000000000000000..acd41d940ef722dc3d7c5356acd0ab6a1c17aafa Binary files /dev/null and b/audios/ref.wav differ diff --git a/audios/test.wav b/audios/test.wav new file mode 100644 index 0000000000000000000000000000000000000000..425d7c04381e56f0b78947743cfa35ca0d99ac17 Binary files /dev/null and b/audios/test.wav differ diff --git a/basis.py b/basis.py new file mode 100644 index 0000000000000000000000000000000000000000..b52cec0c51765ce9d4d30a67f00879ee452ab2d0 --- /dev/null +++ b/basis.py @@ -0,0 +1,113 @@ +class ScoreBasis: + def __init__(self, name=None): + # the score operates on the specified rate + self.score_rate = None + # is the score intrusive or non-intrusive ? 
+ self.intrusive = True #require a reference + self.name = name + + def windowed_scoring(self, audios, score_rate): + raise NotImplementedError(f'In {self.name}, windowed_scoring is not yet implemented') + + def scoring(self, data, window=None, score_rate=None): + """ calling the `windowed_scoring` function that should be specialised + depending on the score.""" + + # imports + #import soundfile as sf + import resampy + from museval.metrics import Framing + + #checking rate + audios = data['audio'] + score_rate = data['rate'] + + if self.score_rate is not None: + score_rate = self.score_rate + + if score_rate != data['rate']: + for index, audio in enumerate(audios): + audio = resampy.resample(audio, data['rate'], score_rate, axis=0) + audios[index] = audio + + if window is not None: + framer = Framing(window * score_rate, window * score_rate, maxlen) + nwin = framer.nwin + result = {} + for (t, win) in enumerate(framer): + result_t = self.windowed_scoring([audio[win] for audio in audios], score_rate) + result[t] = result_t + else: + result = self.windowed_scoring(audios, score_rate) + return result + """ + audios = [] + maxlen = 0 + if isinstance(test_files, str): + test_files = [test_files] + print(f'test_files: {test_files}') + if not self.intrusive and len(test_files) > 1: + if self.verbose: + print(' [%s] is non-intrusive. 
Processing first file only' + % self.name) + test_files = [test_files[0],] + for file in test_files: + # Loading sound file + if isinstance(file, str): + audio, rate = sf.read(file, always_2d=True) + else: + rate = array_rate + if rate is None: + raise ValueError('Sampling rate needs to be specified ' + 'when feeding numpy arrays.') + audio = file + # Standardize shapes + if len(audio.shape) == 1: + audio = audio[:, None] + if len(audio.shape) != 2: + raise ValueError('Please provide 1D or 2D array, received ' + '{}D array'.format(len(audio.shape))) + + if self.fixed_rate is not None and rate != self.fixed_rate: + if self.verbose: + print(' [%s] preferred is %dkHz rate. resampling' + % (self.name, self.fixed_rate)) + audio = resampy.resample(audio, rate, self.fixed_rate, axis=0) + rate = self.fixed_rate + if self.mono and audio.shape[1] > 1: + if self.verbose: + print(' [%s] only supports mono. Will use first channel' + % self.name) + audio = audio[..., 0, None] + if self.mono: + audio = audio[..., 0] + maxlen = max(maxlen, audio.shape[0]) + audios += [audio] + audio = audios[1] + audio[:maxlen-320] = audio[320:] + audios[1] = audio + for index, audio in enumerate(audios): + if audio.shape[0] != maxlen: + new = np.zeros((maxlen,) + audio.shape[1:]) + new[:audio.shape[0]] = audio + audios[index] = new + + if self.window is not None: + framer = Framing(self.window * rate, + self.hop * rate, maxlen) + nwin = framer.nwin + result = {} + for (t, win) in enumerate(framer): + result_t = self.test_window([audio[win] for audio in audios], + rate) + #or metric in result_t.keys(): + # if metric not in result.keys(): + # result[metric] = np.empty(nwin) + # result[metric][t] = result_t[metric] + result[t] = result_t + else: + result = self.test_window(audios, rate) + return result + """ + + diff --git a/demo.py b/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..86bd4c124f4e373fb003cc65a7a8e228360eab47 --- /dev/null +++ b/demo.py @@ -0,0 +1,29 @@ +# 
Import pprint for pretty-printing the results in a more readable format +import pprint +# Import the SpeechScore class to evaluate speech quality metrics +from speechscore import SpeechScore + +# Main block to ensure the code runs only when executed directly +if __name__ == '__main__': + # Initialize a SpeechScore object with a list of score metrics to be evaluated + # Supports any subsets of the list + mySpeechScore = SpeechScore([ + 'SRMR', 'PESQ', 'NB_PESQ', 'STOI', 'SISDR', + 'FWSEGSNR', 'LSD', 'BSSEval', 'DNSMOS', + 'SNR', 'SSNR', 'LLR', 'CSIG', 'CBAK', + 'COVL', 'MCD' + ]) + + # Call the SpeechScore object to evaluate the speech metrics between 'noisy' and 'clean' audio + # Arguments: + # - {test_path, reference_path} supports audio directories or audio paths (.wav or .flac) + # - window (float): seconds, set None to specify no windowing (process the full audio) + # - score_rate (int): specifies the sampling rate at which the metrics should be computed + # - return_mean (bool): set True to specify that the mean score for each metric should be returned + scores = mySpeechScore(test_path='audios/noisy/', reference_path='audios/clean/', window=None, score_rate=16000, return_mean=True) + + # Pretty-print the resulting scores in a readable format + pprint.pprint(scores) + + # Print only the resulting mean scores in a readable format + pprint.pprint(scores['Mean_Score']) diff --git a/requirement.txt b/requirement.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f1049727880c941f324adb2b51fecea317ea44a --- /dev/null +++ b/requirement.txt @@ -0,0 +1,5 @@ +pysptk +pymcd +pyworld +fastdtw +museval diff --git a/scores/__init__.py b/scores/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scores/__pycache__/__init__.cpython-38.pyc b/scores/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..7c9133603b894bbefa859de058c1b5c2edd399db Binary files /dev/null and b/scores/__pycache__/__init__.cpython-38.pyc differ diff --git a/scores/__pycache__/bsseval.cpython-38.pyc b/scores/__pycache__/bsseval.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8a68fc432e2d3ca8c2775cb0e7e71bb602da8f4 Binary files /dev/null and b/scores/__pycache__/bsseval.cpython-38.pyc differ diff --git a/scores/__pycache__/cbak.cpython-38.pyc b/scores/__pycache__/cbak.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25e8db7f234326faa8c7d70a329b8bb9544aef43 Binary files /dev/null and b/scores/__pycache__/cbak.cpython-38.pyc differ diff --git a/scores/__pycache__/covl.cpython-38.pyc b/scores/__pycache__/covl.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c453f99db584c7c3cfa39ca30168f83863d57cff Binary files /dev/null and b/scores/__pycache__/covl.cpython-38.pyc differ diff --git a/scores/__pycache__/csig.cpython-38.pyc b/scores/__pycache__/csig.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21f3655bb7c0948f1bbd3c97835abb146991666d Binary files /dev/null and b/scores/__pycache__/csig.cpython-38.pyc differ diff --git a/scores/__pycache__/fwsegsnr.cpython-38.pyc b/scores/__pycache__/fwsegsnr.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1daa9da0bc041ad9dff863d48f34c75a7869dffe Binary files /dev/null and b/scores/__pycache__/fwsegsnr.cpython-38.pyc differ diff --git a/scores/__pycache__/helper.cpython-38.pyc b/scores/__pycache__/helper.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2742d24b9aa601045c1410087df9d98b9a48f612 Binary files /dev/null and b/scores/__pycache__/helper.cpython-38.pyc differ diff --git a/scores/__pycache__/llr.cpython-38.pyc b/scores/__pycache__/llr.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..44a56daff14f3b3614771ec53b015204b06af7a9 Binary files /dev/null and b/scores/__pycache__/llr.cpython-38.pyc differ diff --git a/scores/__pycache__/lsd.cpython-38.pyc b/scores/__pycache__/lsd.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58d2614bdd343876ec59c5225e2ca6ba92e82d46 Binary files /dev/null and b/scores/__pycache__/lsd.cpython-38.pyc differ diff --git a/scores/__pycache__/mcd.cpython-38.pyc b/scores/__pycache__/mcd.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3a37c67f4a9d42741c0e66214d23b8384089c07 Binary files /dev/null and b/scores/__pycache__/mcd.cpython-38.pyc differ diff --git a/scores/__pycache__/nb_pesq.cpython-38.pyc b/scores/__pycache__/nb_pesq.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4ba23b8074f96a87b7c1c643a4029c8c1ef0536 Binary files /dev/null and b/scores/__pycache__/nb_pesq.cpython-38.pyc differ diff --git a/scores/__pycache__/pesq.cpython-38.pyc b/scores/__pycache__/pesq.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6cac5401940decc3c733b74ecfc5d0e43b714aa Binary files /dev/null and b/scores/__pycache__/pesq.cpython-38.pyc differ diff --git a/scores/__pycache__/sisdr.cpython-38.pyc b/scores/__pycache__/sisdr.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5366ecb19c849097122585cdf73c28cd2938c5c Binary files /dev/null and b/scores/__pycache__/sisdr.cpython-38.pyc differ diff --git a/scores/__pycache__/snr.cpython-38.pyc b/scores/__pycache__/snr.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..531ac2aaad7fa43d9285679795df1a6176666fc6 Binary files /dev/null and b/scores/__pycache__/snr.cpython-38.pyc differ diff --git a/scores/__pycache__/ssnr.cpython-38.pyc b/scores/__pycache__/ssnr.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..a95e66a13bb833b15bda4bdafa8aa7fb7e93f3e2 Binary files /dev/null and b/scores/__pycache__/ssnr.cpython-38.pyc differ diff --git a/scores/__pycache__/stoi.cpython-38.pyc b/scores/__pycache__/stoi.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..175f433d17106d06c8463e38746a5922b8d5521c Binary files /dev/null and b/scores/__pycache__/stoi.cpython-38.pyc differ diff --git a/scores/bsseval.py b/scores/bsseval.py new file mode 100644 index 0000000000000000000000000000000000000000..e4af1f0456ce49041147bda11c538a08eddb8ff4 --- /dev/null +++ b/scores/bsseval.py @@ -0,0 +1,21 @@ +import numpy as np +from basis import ScoreBasis + + +class BSSEval(ScoreBasis): + def __init__(self): + super(BSSEval, self).__init__(name='BSSEval') + self.intrusive = False + + def windowed_scoring(self, audios, score_rate): + bss_window = np.inf + bss_hop = np.inf + from museval.metrics import bss_eval + if len(audios) != 2: + raise ValueError('BSSEval needs a reference and a test signals.') + + result = bss_eval(reference_sources=audios[1][None,...], # shape: [nsrc, nsample, nchannels] + estimated_sources=audios[0][None,...], + window=bss_window * score_rate, + hop=bss_hop * score_rate) + return {'SDR': result[0][0][0], 'ISR': result[1][0][0], 'SAR': result[3][0][0]} diff --git a/scores/cbak.py b/scores/cbak.py new file mode 100644 index 0000000000000000000000000000000000000000..f50f125176881a2b2a196ac584d11741c0cd5233 --- /dev/null +++ b/scores/cbak.py @@ -0,0 +1,37 @@ +from basis import ScoreBasis +import numpy as np +from pesq import pesq +from scores.helper import wss, llr, SSNR, trim_mos + +class CBAK(ScoreBasis): + def __init__(self): + super(CBAK, self).__init__(name='CBAK') + self.score_rate = 16000 + self.intrusive = False + + def windowed_scoring(self, audios, score_rate): + if len(audios) != 2: + raise ValueError('CBAK needs a reference and a test signals.') + return cal_CBAK(audios[0], audios[1], score_rate) 
+ +def cal_CBAK(target_wav, pred_wav, fs): + alpha = 0.95 + + # Compute WSS measure + wss_dist_vec = wss(target_wav, pred_wav, fs) + wss_dist_vec = sorted(wss_dist_vec, reverse=False) + wss_dist = np.mean(wss_dist_vec[:int(round(len(wss_dist_vec) * alpha))]) + + # Compute the SSNR + snr_mean, segsnr_mean = SSNR(target_wav, pred_wav, fs) + segSNR = np.mean(segsnr_mean) + + # Compute the PESQ + pesq_raw = pesq(fs, target_wav, pred_wav, 'wb') + + # Cbak + Cbak = 1.634 + 0.478 * pesq_raw - 0.007 * wss_dist + 0.063 * segSNR + Cbak = trim_mos(Cbak) + + return Cbak + diff --git a/scores/covl.py b/scores/covl.py new file mode 100644 index 0000000000000000000000000000000000000000..abffb21a041cbb5deec966076da856d58128a15d --- /dev/null +++ b/scores/covl.py @@ -0,0 +1,39 @@ +from basis import ScoreBasis +import numpy as np +from pesq import pesq +from scores.helper import wss, llr, SSNR, trim_mos + +class COVL(ScoreBasis): + def __init__(self): + super(COVL, self).__init__(name='COVL') + self.score_rate = 16000 + self.intrusive = False + + def windowed_scoring(self, audios, score_rate): + if len(audios) != 2: + raise ValueError('COVL needs a reference and a test signals.') + return cal_COVL(audios[0], audios[1], score_rate) + +def cal_COVL(target_wav, pred_wav, fs): + alpha = 0.95 + + # Compute WSS measure + wss_dist_vec = wss(target_wav, pred_wav, fs) + wss_dist_vec = sorted(wss_dist_vec, reverse=False) + wss_dist = np.mean(wss_dist_vec[:int(round(len(wss_dist_vec) * alpha))]) + + # Compute LLR measure + LLR_dist = llr(target_wav, pred_wav, fs) + LLR_dist = sorted(LLR_dist, reverse=False) + LLRs = LLR_dist + LLR_len = round(len(LLR_dist) * alpha) + llr_mean = np.mean(LLRs[:LLR_len]) + + # Compute the PESQ + pesq_raw = pesq(fs, target_wav, pred_wav, 'wb') + + # Covl + Covl = 1.594 + 0.805 * pesq_raw - 0.512 * llr_mean - 0.007 * wss_dist + Covl = trim_mos(Covl) + + return Covl diff --git a/scores/csig.py b/scores/csig.py new file mode 100644 index 
0000000000000000000000000000000000000000..5bc5579aefd7a40a641f53a539eaddb46f553794 --- /dev/null +++ b/scores/csig.py @@ -0,0 +1,38 @@ +from basis import ScoreBasis +import numpy as np +from pesq import pesq +from scores.helper import wss, llr, SSNR, trim_mos + +class CSIG(ScoreBasis): + def __init__(self): + super(CSIG, self).__init__(name='CSIG') + self.score_rate = 16000 + + def windowed_scoring(self, audios, score_rate): + if len(audios) != 2: + raise ValueError('CSIG needs a reference and a test signals.') + return cal_CSIG(audios[0], audios[1], score_rate) + +def cal_CSIG(target_wav, pred_wav, fs): + alpha = 0.95 + + # Compute WSS measure + wss_dist_vec = wss(target_wav, pred_wav, fs) + wss_dist_vec = sorted(wss_dist_vec, reverse=False) + wss_dist = np.mean(wss_dist_vec[:int(round(len(wss_dist_vec) * alpha))]) + + # Compute LLR measure + LLR_dist = llr(target_wav, pred_wav, fs) + LLR_dist = sorted(LLR_dist, reverse=False) + LLRs = LLR_dist + LLR_len = round(len(LLR_dist) * alpha) + llr_mean = np.mean(LLRs[:LLR_len]) + + # Compute the PESQ + pesq_raw = pesq(fs, target_wav, pred_wav, 'wb') + + # Csig + Csig = 3.093 - 1.029 * llr_mean + 0.603 * pesq_raw - 0.009 * wss_dist + Csig = float(trim_mos(Csig)) + + return Csig diff --git a/scores/dnsmos/DNSMOS/bak_ovr.onnx b/scores/dnsmos/DNSMOS/bak_ovr.onnx new file mode 100644 index 0000000000000000000000000000000000000000..7c3f8e41619d513ae3d9983729841509eebe4cbc --- /dev/null +++ b/scores/dnsmos/DNSMOS/bak_ovr.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f335c90994618150192a656a474bcf8a9cbcedbc47965494ba8da79605d1308 +size 742375 diff --git a/scores/dnsmos/DNSMOS/model_v8.onnx b/scores/dnsmos/DNSMOS/model_v8.onnx new file mode 100644 index 0000000000000000000000000000000000000000..0e04b14824c4dfc6af9d62040c92c09da56f21e7 --- /dev/null +++ b/scores/dnsmos/DNSMOS/model_v8.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9246480c58567bc6affd4200938e77eef49468c8bc7ed3776d109c07456f6e91 +size 224860 diff --git a/scores/dnsmos/DNSMOS/sig.onnx b/scores/dnsmos/DNSMOS/sig.onnx new file mode 100644 index 0000000000000000000000000000000000000000..eaa69f859d01dcbe66fe6a02f9b696f312f971e3 --- /dev/null +++ b/scores/dnsmos/DNSMOS/sig.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2fbdb293bc2366dfbae2b7477c490f981d24a8b4405efd3c11787569c6549d7 +size 742203 diff --git a/scores/dnsmos/DNSMOS/sig_bak_ovr.onnx b/scores/dnsmos/DNSMOS/sig_bak_ovr.onnx new file mode 100644 index 0000000000000000000000000000000000000000..81f885c678aefcf76de1f00fbc80167aa1ca1d96 --- /dev/null +++ b/scores/dnsmos/DNSMOS/sig_bak_ovr.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:269fbebdb513aa23cddfbb593542ecc540284a91849ac50516870e1ac78f6edd +size 1157965 diff --git a/scores/dnsmos/__pycache__/dnsmos.cpython-38.pyc b/scores/dnsmos/__pycache__/dnsmos.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7687934282d311409dfe34c0db9bbcb64244aa98 Binary files /dev/null and b/scores/dnsmos/__pycache__/dnsmos.cpython-38.pyc differ diff --git a/scores/dnsmos/dnsmos.py b/scores/dnsmos/dnsmos.py new file mode 100644 index 0000000000000000000000000000000000000000..100b0f813c17078c356db8271438539c8ea89054 --- /dev/null +++ b/scores/dnsmos/dnsmos.py @@ -0,0 +1,94 @@ +import os + +import librosa +import numpy as np +import numpy.polynomial.polynomial as poly +import onnxruntime as ort +import soundfile as sf + +SAMPLING_RATE = 16000 +INPUT_LENGTH = 9.01 + +from basis import ScoreBasis + + +class DNSMOS(ScoreBasis): + def __init__(self): + super(DNSMOS, self).__init__(name='DNSMOS') + self.intrusive = True + self.score_rate = 16000 + self.p808_model_path = os.path.join('scores/dnsmos/DNSMOS', 'model_v8.onnx') + self.primary_model_path = os.path.join('scores/dnsmos/DNSMOS', 'sig_bak_ovr.onnx') + self.compute_score = 
ComputeScore(self.primary_model_path, self.p808_model_path) + + def windowed_scoring(self, audios, rate): + if len(audios) == 2: + return self.compute_score.cal_mos(audios[1], rate) + else: + return self.compute_score.cal_mos(audios[0], rate) + +class ComputeScore: + def __init__(self, primary_model_path, p808_model_path) -> None: + self.onnx_sess = ort.InferenceSession(primary_model_path) + self.p808_onnx_sess = ort.InferenceSession(p808_model_path) + + def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True): + mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=frame_size+1, hop_length=hop_length, n_mels=n_mels) + if to_db: + mel_spec = (librosa.power_to_db(mel_spec, ref=np.max)+40)/40 + return mel_spec.T + + def get_polyfit_val(self, sig, bak, ovr): + p_ovr = np.poly1d([-0.06766283, 1.11546468, 0.04602535]) + p_sig = np.poly1d([-0.08397278, 1.22083953, 0.0052439 ]) + p_bak = np.poly1d([-0.13166888, 1.60915514, -0.39604546]) + + sig_poly = p_sig(sig) + bak_poly = p_bak(bak) + ovr_poly = p_ovr(ovr) + + return sig_poly, bak_poly, ovr_poly + + def cal_mos(self, audio, sampling_rate): + fs = sampling_rate + actual_audio_len = len(audio) + len_samples = int(INPUT_LENGTH*fs) + while len(audio) < len_samples: + audio = np.append(audio, audio) + + num_hops = int(np.floor(len(audio)/fs) - INPUT_LENGTH)+1 + hop_len_samples = fs + predicted_mos_sig_seg_raw = [] + predicted_mos_bak_seg_raw = [] + predicted_mos_ovr_seg_raw = [] + predicted_mos_sig_seg = [] + predicted_mos_bak_seg = [] + predicted_mos_ovr_seg = [] + predicted_p808_mos = [] + + for idx in range(num_hops): + audio_seg = audio[int(idx*hop_len_samples) : int((idx+INPUT_LENGTH)*hop_len_samples)] + if len(audio_seg) < len_samples: + continue + + input_features = np.array(audio_seg).astype('float32')[np.newaxis,:] + p808_input_features = np.array(self.audio_melspec(audio=audio_seg[:-160])).astype('float32')[np.newaxis, :, :] + oi = {'input_1': input_features} + 
p808_oi = {'input_1': p808_input_features} + p808_mos = self.p808_onnx_sess.run(None, p808_oi)[0][0][0] + mos_sig_raw,mos_bak_raw,mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0] + mos_sig,mos_bak,mos_ovr = self.get_polyfit_val(mos_sig_raw,mos_bak_raw,mos_ovr_raw) + predicted_mos_sig_seg_raw.append(mos_sig_raw) + predicted_mos_bak_seg_raw.append(mos_bak_raw) + predicted_mos_ovr_seg_raw.append(mos_ovr_raw) + predicted_mos_sig_seg.append(mos_sig) + predicted_mos_bak_seg.append(mos_bak) + predicted_mos_ovr_seg.append(mos_ovr) + predicted_p808_mos.append(p808_mos) + + results = {} + results['OVRL'] = np.mean(predicted_mos_ovr_seg) + results['SIG'] = np.mean(predicted_mos_sig_seg) + results['BAK'] = np.mean(predicted_mos_bak_seg) + results['P808_MOS'] = np.mean(predicted_p808_mos) + return results diff --git a/scores/fwsegsnr.py b/scores/fwsegsnr.py new file mode 100644 index 0000000000000000000000000000000000000000..ae41fa1de9ced547e1f79c803cd9daa1962d07d2 --- /dev/null +++ b/scores/fwsegsnr.py @@ -0,0 +1,49 @@ +import librosa +import numpy as np +from basis import ScoreBasis + +class FWSEGSNR(ScoreBasis): + def __init__(self): + super(FWSEGSNR, self).__init__(name='FWSEGSNR') + self.intrusive = False + + def windowed_scoring(self, audios, score_rate): + if len(audios) != 2: + raise ValueError('FWSEGSNR needs a reference and a test signals.') + return fwsegsnr(audios[1], audios[0], score_rate) + +def fwsegsnr(x, y, fs, frame_sz = 0.025, shift_sz= 0.01, win='hann', numband=23): + epsilon = np.finfo(np.float32).eps + frame = int(np.fix(frame_sz * fs)) + shift = int(np.fix(shift_sz * fs)) + window = win + nband = numband + noverlap = frame - shift + fftpt = int(2**np.ceil(np.log2(np.abs(frame)))) + x = x / np.sqrt(sum(np.power(x, 2))) + y = y / np.sqrt(sum(np.power(y, 2))) + + assert len(x) == len(y), print('Wav length are not matched!') + X_stft = np.abs(librosa.stft(x, n_fft=fftpt, hop_length=shift, win_length=frame, window=window, center=False)) + Y_stft = 
np.abs(librosa.stft(y, n_fft=fftpt, hop_length=shift, win_length=frame, window=window, center=False)) + + num_freq = X_stft.shape[0] + num_frame = X_stft.shape[1] + + X_mel = librosa.feature.melspectrogram(S=X_stft, sr=fs, n_mels=nband, fmin=0, fmax=fs/2) + Y_mel = librosa.feature.melspectrogram(S=Y_stft, sr=fs, n_mels=nband, fmin=0, fmax=fs/2) + + # Calculate SNR. + + W = np.power(Y_mel, 0.2) + E = X_mel - Y_mel + E[E == 0.0] = epsilon + E_power = np.power(E, 2) + Y_div_E = np.divide((np.power(Y_mel,2)), (np.power(E,2))) + Y_div_E[Y_div_E==0] = epsilon + ds = 10 * np.divide(np.sum(np.multiply(W, np.log10(Y_div_E)), 1), np.sum(W, 1)) + ds[ds > 35] = 35 + ds[ds < -10] = -10 + d = np.mean(ds) + return d + diff --git a/scores/helper.py b/scores/helper.py new file mode 100644 index 0000000000000000000000000000000000000000..e34ba2be06d21b34aff666be23d3c78493e8b764 --- /dev/null +++ b/scores/helper.py @@ -0,0 +1,307 @@ +""" +Modifications in Metrics + +# Original copyright: +# Copyright (c) Facebook, Inc. and its affiliates. 
# Demucs (https://github.com/facebookresearch/denoiser) / author: adefossez
# (end of the module docstring opened before this chunk)
import numpy as np
from scipy.linalg import toeplitz


# ----------------------------- HELPERS ------------------------------------ #
def trim_mos(val):
    """Clamp a MOS-style score to the valid [1, 5] range."""
    return min(max(val, 1), 5)


def lpcoeff(speech_frame, model_order):
    """LPC analysis of a single (windowed) speech frame.

    Parameters
    ----------
    speech_frame : 1D ndarray
        Time-domain samples of one analysis frame.
    model_order : int
        LPC order P.

    Returns
    -------
    (acorr, refcoeff, lpparams) : float32 ndarrays
        Autocorrelation lags R[0..P], reflection coefficients k_1..k_P,
        and the LPC polynomial [1, -a_1, ..., -a_P].
    """
    # (1) autocorrelation lags R[k] for k = 0..P
    winlength = speech_frame.shape[0]
    R = []
    for k in range(model_order + 1):
        first = speech_frame[:(winlength - k)]
        second = speech_frame[k:winlength]
        R.append(np.sum(first * second))

    # (2) Levinson-Durbin recursion
    a = np.ones((model_order,))
    E = np.zeros((model_order + 1,))
    rcoeff = np.zeros((model_order,))
    E[0] = R[0]
    for i in range(model_order):
        if i == 0:
            sum_term = 0
        else:
            a_past = a[:i]
            sum_term = np.sum(a_past * np.array(R[i:0:-1]))
        rcoeff[i] = (R[i + 1] - sum_term) / E[i]
        a[i] = rcoeff[i]
        if i > 0:
            # order update: a_j <- a_j - k_i * a_{i-j}
            a[:i] = a_past[:i] - rcoeff[i] * a_past[::-1]
        E[i + 1] = (1 - rcoeff[i] * rcoeff[i]) * E[i]

    acorr = np.array(R, dtype=np.float32)
    refcoeff = np.array(rcoeff, dtype=np.float32)
    # negate so the returned polynomial is [1, -a_1, ..., -a_P]
    a = a * -1
    lpparams = np.array([1] + list(a), dtype=np.float32)
    return acorr, refcoeff, lpparams
# -------------------------------------------------------------------------- #


def SSNR(ref_wav, deg_wav, srate=16000, eps=1e-10):
    """Segmental Signal-to-Noise Ratio objective speech-quality measure.

    Implements the segmental SNR as defined in [1, p. 45] (Eq. 2.12).

    Parameters
    ----------
    ref_wav, deg_wav : 1D ndarrays of equal length (reference / degraded).
    srate : int, sampling rate in Hz.
    eps : float, numerical floor for the per-frame log ratio.

    Returns
    -------
    (overall_snr, segmental_snr) : float and list of per-frame SNRs,
    each frame value clamped to [-10, 35] dB.
    """
    # Work on float copies: the previous version normalized the arrays in
    # place, silently mutating the caller's `ref_wav`/`deg_wav`.
    clean_speech = np.array(ref_wav, dtype=float)
    processed_speech = np.array(deg_wav, dtype=float)
    clean_length = clean_speech.shape[0]
    processed_length = processed_speech.shape[0]

    # scale both to the same dynamic range and remove DC
    clean_speech -= clean_speech.mean()
    processed_speech -= processed_speech.mean()
    processed_speech *= (np.max(np.abs(clean_speech)) /
                         np.max(np.abs(processed_speech)))

    # overall (non-segmental) SNR, computed on the normalized signals as
    # before (ref_wav/deg_wav aliased the normalized arrays in the original)
    dif = clean_speech - processed_speech
    overall_snr = 10 * np.log10(np.sum(clean_speech ** 2) /
                                (np.sum(dif ** 2) + 10e-20))

    # framing parameters
    winlength = int(np.round(30 * srate / 1000))  # 30 ms window
    skiprate = winlength // 4                     # 75% overlap
    MIN_SNR = -10
    MAX_SNR = 35

    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    segmental_snr = []

    for frame_count in range(int(num_frames)):
        # (1) window the current reference/processed frames
        clean_frame = clean_speech[start:start + winlength] * window
        processed_frame = processed_speech[start:start + winlength] * window

        # (2) per-frame SNR, clamped to [MIN_SNR, MAX_SNR]
        signal_energy = np.sum(clean_frame ** 2)
        noise_energy = np.sum((clean_frame - processed_frame) ** 2)
        frame_snr = 10 * np.log10(signal_energy / (noise_energy + eps) + eps)
        segmental_snr.append(min(max(frame_snr, MIN_SNR), MAX_SNR))
        start += int(skiprate)
    return overall_snr, segmental_snr


def wss(ref_wav, deg_wav, srate):
    """Weighted Spectral Slope measure (Klatt 1982), one value per frame.

    Returns a list with one WSS distortion value per 30 ms frame.
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]

    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.)  # 30 ms frame (240 samples @ 8 kHz)
    skiprate = np.floor(winlength / 4)
    max_freq = srate / 2
    num_crit = 25  # number of critical bands

    n_fft = int(2 ** np.ceil(np.log(2 * winlength) / np.log(2)))
    n_fftby2 = int(n_fft / 2)
    Kmax = 20
    Klocmax = 1

    # critical band filter definitions (center frequency and BW in Hz)
    cent_freq = [50., 120, 190, 260, 330, 400, 470, 540, 617.372,
                 703.378, 798.717, 904.128, 1020.38, 1148.30,
                 1288.72, 1442.54, 1610.70, 1794.16, 1993.93,
                 2211.08, 2446.71, 2701.97, 2978.04, 3276.17,
                 3597.63]
    bandwidth = [70., 70, 70, 70, 70, 70, 70, 77.3724, 86.0056,
                 95.3398, 105.411, 116.256, 127.914, 140.423,
                 153.823, 168.154, 183.457, 199.776, 217.153,
                 235.631, 255.255, 276.072, 298.126, 321.465,
                 346.136]

    bw_min = bandwidth[0]  # minimum critical bandwidth

    # Gaussian-shaped critical band filters with equal weight sums per band;
    # weights below the filter's -30 dB point are zeroed.
    min_factor = np.exp(-30. / (2 * 2.303))

    crit_filter = np.zeros((num_crit, n_fftby2))
    for i in range(num_crit):
        f0 = (cent_freq[i] / max_freq) * (n_fftby2)
        bw = (bandwidth[i] / max_freq) * (n_fftby2)
        norm_factor = np.log(bw_min) - np.log(bandwidth[i])
        j = list(range(n_fftby2))
        crit_filter[i, :] = np.exp(-11 * (((j - np.floor(f0)) / bw) ** 2) +
                                   norm_factor)
        crit_filter[i, :] = crit_filter[i, :] * (crit_filter[i, :] >
                                                 min_factor)

    # per-frame WSS computation
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    distortion = []

    for frame_count in range(num_frames):
        # (1) window the current frames
        clean_frame = clean_speech[start:start + winlength] * window
        processed_frame = processed_speech[start:start + winlength] * window

        # (2) power spectra of clean and processed frames
        clean_spec = (np.abs(np.fft.fft(clean_frame, n_fft)) ** 2)
        processed_spec = (np.abs(np.fft.fft(processed_frame, n_fft)) ** 2)
        clean_energy = [None] * num_crit
        processed_energy = [None] * num_crit

        # (3) filterbank output energies in dB (floored at 1e-10 pre-log)
        for i in range(num_crit):
            clean_energy[i] = np.sum(clean_spec[:n_fftby2] *
                                     crit_filter[i, :])
            processed_energy[i] = np.sum(processed_spec[:n_fftby2] *
                                         crit_filter[i, :])
        clean_energy = np.array(clean_energy).reshape(-1, 1)
        eps = np.ones((clean_energy.shape[0], 1)) * 1e-10
        clean_energy = np.concatenate((clean_energy, eps), axis=1)
        clean_energy = 10 * np.log10(np.max(clean_energy, axis=1))
        processed_energy = np.array(processed_energy).reshape(-1, 1)
        processed_energy = np.concatenate((processed_energy, eps), axis=1)
        processed_energy = 10 * np.log10(np.max(processed_energy, axis=1))

        # (4) spectral slopes (dB[i+1] - dB[i])
        clean_slope = clean_energy[1:num_crit] - clean_energy[:num_crit - 1]
        processed_slope = processed_energy[1:num_crit] - \
            processed_energy[:num_crit - 1]

        # (5) nearest peak location for each band: search right on a rising
        # slope, left on a falling one
        clean_loc_peak = []
        processed_loc_peak = []
        for i in range(num_crit - 1):
            if clean_slope[i] > 0:
                n = i
                while n < num_crit - 1 and clean_slope[n] > 0:
                    n += 1
                clean_loc_peak.append(clean_energy[n - 1])
            else:
                n = i
                while n >= 0 and clean_slope[n] <= 0:
                    n -= 1
                clean_loc_peak.append(clean_energy[n + 1])
            # same search on the processed spectrum
            if processed_slope[i] > 0:
                n = i
                while n < num_crit - 1 and processed_slope[n] > 0:
                    n += 1
                processed_loc_peak.append(processed_energy[n - 1])
            else:
                n = i
                while n >= 0 and processed_slope[n] <= 0:
                    n -= 1
                processed_loc_peak.append(processed_energy[n + 1])

        # (6) weights emphasize spectral peaks over valleys (Klatt 1982,
        # p. 1280); the clean and processed weights are averaged
        dBMax_clean = max(clean_energy)
        dBMax_processed = max(processed_energy)
        clean_loc_peak = np.array(clean_loc_peak)
        processed_loc_peak = np.array(processed_loc_peak)
        Wmax_clean = Kmax / (Kmax + dBMax_clean - clean_energy[:num_crit - 1])
        Wlocmax_clean = Klocmax / (Klocmax + clean_loc_peak -
                                   clean_energy[:num_crit - 1])
        W_clean = Wmax_clean * Wlocmax_clean
        Wmax_processed = Kmax / (Kmax + dBMax_processed -
                                 processed_energy[:num_crit - 1])
        Wlocmax_processed = Klocmax / (Klocmax + processed_loc_peak -
                                       processed_energy[:num_crit - 1])
        W_processed = Wmax_processed * Wlocmax_processed
        W = (W_clean + W_processed) / 2
        distortion.append(np.sum(W * (clean_slope[:num_crit - 1] -
                                      processed_slope[:num_crit - 1]) ** 2))

        # normalization by the weight sum is not part of Klatt's paper but
        # keeps the measure comparable across frames
        distortion[frame_count] = distortion[frame_count] / np.sum(W)
        start += int(skiprate)
    return distortion


def llr(ref_wav, deg_wav, srate):
    """Log-Likelihood Ratio measure; returns one value per frame (ndarray).

    NaNs from degenerate frames are mapped to 0 via np.nan_to_num.
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]
    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.)  # 30 ms frame (240 samples @ 8 kHz)
    skiprate = np.floor(winlength / 4)
    # LPC analysis order depends on the signal bandwidth
    if srate < 10000:
        P = 10
    else:
        P = 16

    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    distortion = []

    for frame_count in range(num_frames):
        # (1) window the current frames
        clean_frame = clean_speech[start:start + winlength] * window
        processed_frame = processed_speech[start:start + winlength] * window

        # (2) autocorrelation lags and LPC parameters of both frames
        R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
        R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
        A_clean = A_clean[None, :]
        A_processed = A_processed[None, :]

        # (3) LLR for this frame: quadratic forms against the clean
        # autocorrelation (Toeplitz) matrix
        numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
        denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)

        if (numerator / denominator) <= 0:
            print(f'Numerator: {numerator}')
            print(f'Denominator: {denominator}')

        log_ = np.log(numerator / denominator)
        distortion.append(np.squeeze(log_))
        start += int(skiprate)
    return np.nan_to_num(np.array(distortion))
# -------------------------------------------------------------------------- #

# --- next in this diff chunk: new file scores/helper_bk.py ---
# (its module docstring opens here in the original: "Modifications in
#  Metrics" plus the original Facebook/Demucs copyright notice)
# Demucs (https://github.com/facebookresearch/denoiser) / author: adefossez
# (closing of the scores/helper_bk.py module docstring in the original diff)
import numpy as np
from scipy.linalg import toeplitz

# ----------------------------- HELPERS ------------------------------------ #
def trim_mos(val):
    # Clamp a MOS-style score to the valid [1, 5] range.
    return min(max(val, 1), 5)

def lpcoeff(speech_frame, model_order):
    """LPC analysis of one windowed speech frame.

    Returns (acorr, refcoeff, lpparams): autocorrelation lags R[0..P],
    reflection coefficients, and the LPC polynomial [1, -a_1, ..., -a_P],
    all as float32 arrays.
    """
    # (1) Compute Autocor lags R[k], k = 0..P
    winlength = speech_frame.shape[0]
    R = []
    for k in range(model_order + 1):
        first = speech_frame[:(winlength - k)]
        second = speech_frame[k:winlength]
        R.append(np.sum(first * second))

    # (2) Lev-Durbin recursion
    a = np.ones((model_order,))
    E = np.zeros((model_order + 1,))
    rcoeff = np.zeros((model_order,))
    E[0] = R[0]
    for i in range(model_order):
        if i == 0:
            sum_term = 0
        else:
            a_past = a[:i]
            sum_term = np.sum(a_past * np.array(R[i:0:-1]))
        rcoeff[i] = (R[i+1] - sum_term)/E[i]
        a[i] = rcoeff[i]
        if i > 0:
            # order update: a_j <- a_j - k_i * a_{i-j}
            a[:i] = a_past[:i] - rcoeff[i] * a_past[::-1]
        E[i+1] = (1-rcoeff[i]*rcoeff[i])*E[i]
    acorr = np.array(R, dtype=np.float32)
    refcoeff = np.array(rcoeff, dtype=np.float32)
    # negate so lpparams is [1, -a_1, ..., -a_P]
    a = a * -1
    lpparams = np.array([1] + list(a), dtype=np.float32)
    # NOTE(review): the three casts below are redundant re-casts of arrays
    # that are already float32.
    acorr = np.array(acorr, dtype=np.float32)
    refcoeff = np.array(refcoeff, dtype=np.float32)
    lpparams = np.array(lpparams, dtype=np.float32)

    return acorr, refcoeff, lpparams
# -------------------------------------------------------------------------- #


def SSNR(ref_wav, deg_wav, srate=16000, eps=1e-10):
    """ Segmental Signal-to-Noise Ratio Objective Speech Quality Measure
    This function implements the segmental signal-to-noise ratio
    as defined in [1, p. 45] (see Equation 2.12).

    Returns (overall_snr, per-frame segmental SNR list clamped to [-10, 35]).
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]

    # scale both to have same dynamic range. Remove DC too.
    # NOTE(review): these operate in place and therefore mutate the caller's
    # ref_wav/deg_wav arrays (clean_speech/processed_speech are aliases).
    clean_speech -= clean_speech.mean()
    processed_speech -= processed_speech.mean()
    processed_speech *= (np.max(np.abs(clean_speech)) / np.max(np.abs(processed_speech)))

    # Signal-to-Noise Ratio (overall, on the normalized signals)
    dif = ref_wav - deg_wav
    overall_snr = 10 * np.log10(np.sum(ref_wav ** 2) / (np.sum(dif ** 2) +
                                                        10e-20))
    # framing parameters
    winlength = int(np.round(30 * srate / 1000))  # 30 msecs
    skiprate = winlength // 4  # 75% frame overlap
    MIN_SNR = -10
    MAX_SNR = 35

    # For each frame, calculate SSNR
    num_frames = int(clean_length / skiprate - (winlength/skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    segmental_snr = []

    for frame_count in range(int(num_frames)):
        # (1) get the frames for the test and ref speech.
        # Apply Hanning Window
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Compute Segmental SNR, clamped to [MIN_SNR, MAX_SNR]
        signal_energy = np.sum(clean_frame ** 2)
        noise_energy = np.sum((clean_frame - processed_frame) ** 2)
        segmental_snr.append(10 * np.log10(signal_energy / (noise_energy + eps)+ eps))
        segmental_snr[-1] = max(segmental_snr[-1], MIN_SNR)
        segmental_snr[-1] = min(segmental_snr[-1], MAX_SNR)
        start += int(skiprate)
    return overall_snr, segmental_snr


def wss(ref_wav, deg_wav, srate):
    """Weighted Spectral Slope measure (Klatt 1982); one value per frame."""
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]

    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.)  # 240 wlen in samples (at 8 kHz)
    skiprate = np.floor(winlength / 4)
    max_freq = srate / 2
    num_crit = 25  # num of critical bands

    USE_FFT_SPECTRUM = 1  # NOTE(review): unused
    n_fft = int(2 ** np.ceil(np.log(2*winlength)/np.log(2)))
    n_fftby2 = int(n_fft / 2)
    Kmax = 20
    Klocmax = 1

    # Critical band filter definitions (Center frequency and BW in Hz)
    cent_freq = [50., 120, 190, 260, 330, 400, 470, 540, 617.372,
                 703.378, 798.717, 904.128, 1020.38, 1148.30,
                 1288.72, 1442.54, 1610.70, 1794.16, 1993.93,
                 2211.08, 2446.71, 2701.97, 2978.04, 3276.17,
                 3597.63]
    bandwidth = [70., 70, 70, 70, 70, 70, 70, 77.3724, 86.0056,
                 95.3398, 105.411, 116.256, 127.914, 140.423,
                 153.823, 168.154, 183.457, 199.776, 217.153,
                 235.631, 255.255, 276.072, 298.126, 321.465,
                 346.136]

    bw_min = bandwidth[0]  # min critical bandwidth

    # set up critical band filters. Note here that Gaussianly shaped filters
    # are used. Also, the sum of the filter weights are equivalent for each
    # critical band filter. Filter less than -30 dB and set to zero.
    min_factor = np.exp(-30. / (2 * 2.303))  # -30 dB point of filter

    crit_filter = np.zeros((num_crit, n_fftby2))
    all_f0 = []  # NOTE(review): collected but never used
    for i in range(num_crit):
        f0 = (cent_freq[i] / max_freq) * (n_fftby2)
        all_f0.append(np.floor(f0))
        bw = (bandwidth[i] / max_freq) * (n_fftby2)
        norm_factor = np.log(bw_min) - np.log(bandwidth[i])
        j = list(range(n_fftby2))
        crit_filter[i, :] = np.exp(-11 * (((j - np.floor(f0)) / bw) ** 2) + \
                                   norm_factor)
        crit_filter[i, :] = crit_filter[i, :] * (crit_filter[i, :] > \
                                                 min_factor)

    # For each frame of input speech, compute Weighted Spectral Slope Measure
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0  # starting sample
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    distortion = []

    for frame_count in range(num_frames):
        # (1) Get the Frames for the test and reference speeech.
        # Multiply by Hanning window.
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Compuet Power Spectrum of clean and processed
        clean_spec = (np.abs(np.fft.fft(clean_frame, n_fft)) ** 2)
        processed_spec = (np.abs(np.fft.fft(processed_frame, n_fft)) ** 2)
        clean_energy = [None] * num_crit
        processed_energy = [None] * num_crit

        # (3) Compute Filterbank output energies (in dB), floored at 1e-10
        for i in range(num_crit):
            clean_energy[i] = np.sum(clean_spec[:n_fftby2] * \
                                     crit_filter[i, :])
            processed_energy[i] = np.sum(processed_spec[:n_fftby2] * \
                                         crit_filter[i, :])
        clean_energy = np.array(clean_energy).reshape(-1, 1)
        eps = np.ones((clean_energy.shape[0], 1)) * 1e-10
        clean_energy = np.concatenate((clean_energy, eps), axis=1)
        clean_energy = 10 * np.log10(np.max(clean_energy, axis=1))
        processed_energy = np.array(processed_energy).reshape(-1, 1)
        processed_energy = np.concatenate((processed_energy, eps), axis=1)
        processed_energy = 10 * np.log10(np.max(processed_energy, axis=1))

        # (4) Compute Spectral Shape (dB[i+1] - dB[i])
        clean_slope = clean_energy[1:num_crit] - clean_energy[:num_crit-1]
        processed_slope = processed_energy[1:num_crit] - \
            processed_energy[:num_crit-1]

        # (5) Find the nearest peak locations in the spectra to each
        # critical band. If the slope is negative, we search
        # to the left. If positive, we search to the right.
        clean_loc_peak = []
        processed_loc_peak = []
        for i in range(num_crit - 1):
            if clean_slope[i] > 0:
                # search to the right
                n = i
                while n < num_crit - 1 and clean_slope[n] > 0:
                    n += 1
                clean_loc_peak.append(clean_energy[n - 1])
            else:
                # search to the left
                n = i
                while n >= 0 and clean_slope[n] <= 0:
                    n -= 1
                clean_loc_peak.append(clean_energy[n + 1])
            # find the peaks in the processed speech signal
            if processed_slope[i] > 0:
                n = i
                while n < num_crit - 1 and processed_slope[n] > 0:
                    n += 1
                processed_loc_peak.append(processed_energy[n - 1])
            else:
                n = i
                while n >= 0 and processed_slope[n] <= 0:
                    n -= 1
                processed_loc_peak.append(processed_energy[n + 1])

        # (6) Compuet the WSS Measure for this frame. This includes
        # determination of the weighting functino.
        # The weights are calculated by averaging individual
        # weighting factors from the clean and processed frame.
        # These weights W_clean and W_processed should range
        # from 0 to 1 and place more emphasis on spectral
        # peaks and less emphasis on slope differences in spectral
        # valleys. This procedure is described on page 1280 of
        # Klatt's 1982 ICASSP paper.
        dBMax_clean = max(clean_energy)
        dBMax_processed = max(processed_energy)
        clean_loc_peak = np.array(clean_loc_peak)
        processed_loc_peak = np.array(processed_loc_peak)
        Wmax_clean = Kmax / (Kmax + dBMax_clean - clean_energy[:num_crit-1])
        Wlocmax_clean = Klocmax / (Klocmax + clean_loc_peak - \
                                   clean_energy[:num_crit-1])
        W_clean = Wmax_clean * Wlocmax_clean
        Wmax_processed = Kmax / (Kmax + dBMax_processed - \
                                 processed_energy[:num_crit-1])
        Wlocmax_processed = Klocmax / (Klocmax + processed_loc_peak - \
                                       processed_energy[:num_crit-1])
        W_processed = Wmax_processed * Wlocmax_processed
        W = (W_clean + W_processed) / 2
        distortion.append(np.sum(W * (clean_slope[:num_crit - 1] - \
                                      processed_slope[:num_crit - 1]) ** 2))

        # this normalization is not part of Klatt's paper, but helps
        # to normalize the meaasure. Here we scale the measure by the sum of the
        # weights
        distortion[frame_count] = distortion[frame_count] / np.sum(W)
        start += int(skiprate)
    return distortion


def llr(ref_wav, deg_wav, srate):
    """Log-Likelihood Ratio; returns one value per frame (NaNs -> 0)."""
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]
    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.)  # 240 wlen in samples
    skiprate = np.floor(winlength / 4)
    if srate < 10000:
        # LPC analysis order
        P = 10
    else:
        P = 16

    # For each frame of input speech, calculate the Log Likelihood Ratio
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    distortion = []

    for frame_count in range(num_frames):
        # (1) Get the Frames for the test and reference speeech.
        # Multiply by Hanning window.
        clean_frame = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Get the autocorrelation logs and LPC params used
        # to compute the LLR measure
        R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
        R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
        A_clean = A_clean[None, :]
        A_processed = A_processed[None, :]

        # (3) Compute the LLR measure: quadratic forms against the clean
        # autocorrelation (Toeplitz) matrix
        numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
        denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)

        if (numerator/denominator) <= 0:
            print(f'Numerator: {numerator}')
            print(f'Denominator: {denominator}')

        log_ = np.log(numerator / denominator)
        distortion.append(np.squeeze(log_))
        start += int(skiprate)
    return np.nan_to_num(np.array(distortion))
# -------------------------------------------------------------------------- #

#!/usr/bin/env python3

# Copyright 2020 Wen-Chin Huang and Tomoki Hayashi
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
# ported from https://github.com/espnet/espnet/blob/master/utils/mcd_calculate.py

"""Evaluate MCD between generated and groundtruth audios with SPTK-based mcep."""

from typing import Tuple

import numpy as np
import pysptk
from fastdtw import fastdtw
from scipy import spatial


def sptk_extract(
    x: np.ndarray,
    fs: int,
    n_fft: int = 512,
    n_shift: int = 256,
    mcep_dim: int = 25,
    mcep_alpha: float = 0.41,
    is_padding: bool = False,
) -> np.ndarray:
    """Extract SPTK-based mel-cepstrum.

    Args:
        x (ndarray): 1D waveform array.
        fs (int): Sampling rate
        n_fft (int): FFT length in point (default=512).
        n_shift (int): Shift length in point (default=256).
        mcep_dim (int): Dimension of mel-cepstrum (default=25).
        mcep_alpha (float): All pass filter coefficient (default=0.41).
        is_padding (bool): Whether to pad the end of signal (default=False).

    Returns:
        ndarray: Mel-cepstrum with the size (n_frame, mcep_dim + 1).

    """
    # perform padding so the last partial frame is covered
    if is_padding:
        n_pad = n_fft - (len(x) - n_fft) % n_shift
        x = np.pad(x, (0, n_pad), "reflect")

    # get number of frames
    n_frame = (len(x) - n_fft) // n_shift + 1

    # get window function
    win = pysptk.sptk.hamming(n_fft)

    # fall back to rate-dependent defaults when either setting is missing
    if mcep_dim is None or mcep_alpha is None:
        mcep_dim, mcep_alpha = _get_best_mcep_params(fs)

    # calculate mel-cepstrum per windowed frame
    mcep = [
        pysptk.mcep(
            x[n_shift * i : n_shift * i + n_fft] * win,
            mcep_dim,
            mcep_alpha,
            eps=1e-6,
            etype=1,
        )
        for i in range(n_frame)
    ]

    return np.stack(mcep)


def _get_best_mcep_params(fs: int) -> Tuple[int, float]:
    # Recommended (order, alpha) pairs per sampling rate, see
    # https://sp-nitech.github.io/sptk/latest/main/mgcep.html#_CPPv4N4sptk19MelCepstralAnalysisE
    if fs == 8000:
        return 13, 0.31
    elif fs == 16000:
        return 23, 0.42
    elif fs == 22050:
        return 34, 0.45
    elif fs == 24000:
        return 34, 0.46
    elif fs == 32000:
        return 36, 0.50
    elif fs == 44100:
        return 39, 0.53
    elif fs == 48000:
        return 39, 0.55
    else:
        raise ValueError(f"Not found the setting for {fs}.")


def calculate_mcd(
    inf_audio,
    ref_audio,
    fs,
    n_fft=1024,
    n_shift=256,
    mcep_dim=None,
    mcep_alpha=None,
):
    """Calculate MCD between an inferred and a reference waveform.

    Frames are aligned with DTW before averaging the per-frame distance.
    """

    # extract ground truth and converted features
    gen_mcep = sptk_extract(
        x=inf_audio,
        fs=fs,
        n_fft=n_fft,
        n_shift=n_shift,
        mcep_dim=mcep_dim,
        mcep_alpha=mcep_alpha,
    )
    gt_mcep = sptk_extract(
        x=ref_audio,
        fs=fs,
        n_fft=n_fft,
        n_shift=n_shift,
        mcep_dim=mcep_dim,
        mcep_alpha=mcep_alpha,
    )

    # DTW alignment between the two cepstral sequences
    _, path = fastdtw(gen_mcep, gt_mcep, dist=spatial.distance.euclidean)
    twf = np.array(path).T
    gen_mcep_dtw = gen_mcep[twf[0]]
    gt_mcep_dtw = gt_mcep[twf[1]]

    # MCD: mean over aligned frames of the dB-scaled Euclidean distance
    diff2sum = np.sum((gen_mcep_dtw - gt_mcep_dtw) ** 2, 1)
    mcd = np.mean(10.0 / np.log(10.0) * np.sqrt(2 * diff2sum), 0)

    return mcd
# --- new file in this diff: scores/llr.py ---
from basis import ScoreBasis
import numpy as np
from scipy.linalg import toeplitz
from scores.helper import lpcoeff


class LLR(ScoreBasis):
    """Log-Likelihood Ratio: mean spectral distance over 30 ms frames."""

    def __init__(self):
        super(LLR, self).__init__(name='LLR')
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        if len(audios) != 2:
            raise ValueError('LLR needs a reference and a test signals.')
        return cal_LLR(audios[0], audios[1], score_rate)


def cal_LLR(ref_wav, deg_wav, srate):
    """Mean Log-Likelihood Ratio between reference and degraded signals.

    obtained from https://github.com/wooseok-shin/MetricGAN-plus-pytorch/blob/main/metric_functions/metric_helper.py
    """
    clean_speech = ref_wav
    processed_speech = deg_wav
    clean_length = ref_wav.shape[0]
    processed_length = deg_wav.shape[0]
    assert clean_length == processed_length, clean_length

    winlength = round(30 * srate / 1000.)  # 240 wlen in samples (8 kHz)
    skiprate = np.floor(winlength / 4)
    # LPC analysis order depends on the bandwidth
    if srate < 10000:
        P = 10
    else:
        P = 16

    # For each frame of input speech, calculate the Log Likelihood Ratio
    num_frames = int(clean_length / skiprate - (winlength / skiprate))
    start = 0
    time = np.linspace(1, winlength, winlength) / (winlength + 1)
    window = 0.5 * (1 - np.cos(2 * np.pi * time))  # Hanning window
    distortion = []

    for frame_count in range(num_frames):
        # (1) window the current reference/processed frames
        clean_frame = clean_speech[start:start + winlength] * window
        processed_frame = processed_speech[start:start + winlength] * window

        # (2) autocorrelation lags and LPC params of both frames
        R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
        R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
        A_clean = A_clean[None, :]
        A_processed = A_processed[None, :]

        # (3) LLR measure for this frame
        numerator = A_processed.dot(toeplitz(R_clean)).dot(A_processed.T)
        denominator = A_clean.dot(toeplitz(R_clean)).dot(A_clean.T)

        if (numerator / denominator) <= 0:
            print(f'Numerator: {numerator}')
            print(f'Denominator: {denominator}')

        log_ = np.log(numerator / denominator)
        distortion.append(np.squeeze(log_))
        start += int(skiprate)
    return np.mean(np.nan_to_num(np.array(distortion)))


# --- new file in this diff: scores/lsd.py ---
import librosa

EPS = 1e-12


class LSD(ScoreBasis):
    """Log-Spectral Distance between reference and estimated spectrograms."""

    def __init__(self):
        super(LSD, self).__init__(name='LSD')
        self.intrusive = False
        self.mono = True

    def windowed_scoring(self, audios, score_rate):
        if len(audios) != 2:
            # fixed: the message previously said 'NB_PESQ' (copy-paste slip)
            raise ValueError('LSD needs a reference and a test signals.')
        est = wav_to_spectrogram(audios[1], score_rate)
        target = wav_to_spectrogram(audios[0], score_rate)
        return cal_LSD(est, target)


def wav_to_spectrogram(wav, rate):
    """Magnitude STFT, 10 ms hop; n_fft scaled so 48 kHz maps to 2048."""
    hop_length = int(rate / 100)
    n_fft = int(2048 / (48000 / rate))
    spec = np.abs(librosa.stft(wav, hop_length=hop_length, n_fft=n_fft))
    # (time, freq) layout
    spec = np.transpose(spec, (1, 0))
    return spec


def cal_LSD(est, target):
    """Mean over time of the RMS (over frequency) log10 power ratio."""
    log_ratio = np.log10(target ** 2 / ((est + EPS) ** 2) + EPS) ** 2
    # fixed: `return` and `lsd_` were split across lines in the diff
    lsd_ = np.mean(np.mean(log_ratio, axis=1) ** 0.5, axis=0)
    return lsd_


# --- new file in this diff: scores/mcd.py ---
import math
import pyworld
import pysptk
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
# refer to: https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py


class MCD(ScoreBasis):
    """Mel-Cepstral Distortion, delegating to Calculate_MCD below."""

    def __init__(self):
        super(MCD, self).__init__(name='MCD')
        self.intrusive = False
        # three different modes: "plain", "dtw" and "dtw_sl"
        self.mcd_toolbox = Calculate_MCD(MCD_mode="plain")

    def windowed_scoring(self, audios, score_rate):
        if len(audios) != 2:
            raise ValueError('MCD needs a reference and a test signals.')
        return self.mcd_toolbox.calculate_mcd(audios[1], audios[0], score_rate)


# ================================================= #
# calculate the Mel-Cepstral Distortion (MCD) value #
# ================================================= #
class Calculate_MCD(object):
    """MCD calculator ported from pymcd (see link above)."""

    def __init__(self, MCD_mode):
        super(Calculate_MCD, self).__init__()
        self.MCD_mode = MCD_mode
        self.FRAME_PERIOD = 5.0
        # 10 / ln(10) * sqrt(2) ~= 6.141851463713754
        self.log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)

    def load_wav(self, wav_file, sample_rate):
        """Load a wav file with librosa (mono, resampled to sample_rate)."""
        wav, _ = librosa.load(wav_file, sr=sample_rate, mono=True)
        return wav

    def log_spec_dB_dist(self, x, y):
        """Log-spectral distance (dB) between two cepstral vectors."""
        diff = x - y
        return self.log_spec_dB_const * math.sqrt(np.inner(diff, diff))

    def calculate_mcd_distance(self, x, y, path):
        """Total Euclidean cost along `path` (pairs of frame indices)."""
        pathx = list(map(lambda l: l[0], path))
        pathy = list(map(lambda l: l[1], path))
        x, y = x[pathx], y[pathy]
        frames_tot = x.shape[0]  # number of aligned frame pairs

        z = x - y
        min_cost_tot = np.sqrt((z * z).sum(-1)).sum()

        return frames_tot, min_cost_tot

    def wav2mcep_numpy(self, loaded_wav, score_rate=22050, alpha=0.65, fft_size=512):
        """WORLD spectral envelope -> MCEP features (order 13).

        alpha=0.65 is the value commonly used at 22050 Hz.
        """
        _, sp, _ = pyworld.wav2world(loaded_wav.astype(np.double), fs=score_rate,
                                     frame_period=self.FRAME_PERIOD, fft_size=fft_size)
        mcep = pysptk.sptk.mcep(sp, order=13, alpha=alpha, maxiter=0,
                                etype=1, eps=1.0E-8, min_det=0.0, itype=3)
        return mcep

    def average_mcd(self, loaded_ref_wav, loaded_syn_wav, cost_function, MCD_mode, score_rate):
        """Average MCD between two already-loaded waveforms.

        NOTE(review): this method was garbled in the diff; reconstructed from
        the pymcd reference implementation — verify against upstream.
        """
        if MCD_mode == "plain":
            # pad the shorter waveform with zeros so both have equal length
            if len(loaded_ref_wav) < len(loaded_syn_wav):
                loaded_ref_wav = np.pad(
                    loaded_ref_wav, (0, len(loaded_syn_wav) - len(loaded_ref_wav)))
            else:
                loaded_syn_wav = np.pad(
                    loaded_syn_wav, (0, len(loaded_ref_wav) - len(loaded_syn_wav)))

        ref_mcep_vec = self.wav2mcep_numpy(loaded_ref_wav, score_rate)
        syn_mcep_vec = self.wav2mcep_numpy(loaded_syn_wav, score_rate)

        cof = 1.0
        if MCD_mode == "plain":
            # frame-by-frame alignment (signals were padded to equal length)
            path = list(zip(range(len(ref_mcep_vec)), range(len(syn_mcep_vec))))
        else:
            # "dtw_sl" additionally scales by the sequence-length ratio
            cof = len(ref_mcep_vec) / len(syn_mcep_vec) \
                if len(ref_mcep_vec) > len(syn_mcep_vec) \
                else len(syn_mcep_vec) / len(ref_mcep_vec)
            # exclude the 0th (energy) coefficient from the DTW alignment
            _, path = fastdtw(ref_mcep_vec[:, 1:], syn_mcep_vec[:, 1:], dist=euclidean)

        frames_tot, min_cost_tot = self.calculate_mcd_distance(
            ref_mcep_vec, syn_mcep_vec, path)

        if MCD_mode == "dtw_sl":
            mean_mcd = cof * self.log_spec_dB_const * min_cost_tot / frames_tot
        else:
            mean_mcd = self.log_spec_dB_const * min_cost_tot / frames_tot

        return mean_mcd

    def calculate_mcd(self, reference_audio, synthesized_audio, score_rate):
        """Public entry point: MCD between two in-memory waveforms."""
        mean_mcd = self.average_mcd(reference_audio, synthesized_audio,
                                    self.log_spec_dB_dist, self.MCD_mode,
                                    score_rate)
        return mean_mcd


# --- new file in this diff: scores/mosnet/__init__.py ---
def load(window, hop=None):
    """Build a MOSNet metric; configures TF GPU memory growth first."""
    import tensorflow as tf
    from .model import MOSNet
    tf.debugging.set_log_device_placement(False)
    # set memory growth
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)

            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,",
                  len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # Memory growth must be set before GPUs have been initialized
            print(e)

    mosnet = MOSNet(window, hop)
    return mosnet


# --- binary entries in this diff (content not representable as text): ---
#   scores/mosnet/__pycache__/__init__.cpython-38.pyc
#   scores/mosnet/cnn_blstm.h5 (git-lfs pointer,
#     sha256:78b75e7d76ee6074ea7d57dcffa56d0c90be9d3d8dedc2217e25e259423cb756,
#     size 14248464)

# --- new file in this diff: scores/mosnet/model.py ---
from tensorflow import keras
from tensorflow.keras import Model, layers
from tensorflow.keras.layers import Dense, Dropout, Conv2D
from tensorflow.keras.layers import LSTM, TimeDistributed, Bidirectional
from tensorflow.keras.constraints import max_norm
import librosa
import scipy
import numpy as np
import os
from ... import Metric
import Metric + +# prevent TF warnings +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' + + +class MOSNet(Metric): + def __init__(self, window, hop=None): + super(MOSNet, self).__init__(name='MOSNet', window=window, hop=hop) + + # constants + self.fixed_rate = 16000 + self.mono = True + self.absolute = True + + self.FFT_SIZE = 512 + self.SGRAM_DIM = self.FFT_SIZE // 2 + 1 + self.HOP_LENGTH = 256 + self.WIN_LENGTH = 512 + + _input = keras.Input(shape=(None, 257)) + + re_input = layers.Reshape((-1, 257, 1), input_shape=(-1, 257))(_input) + + # CNN + conv1 = (Conv2D(16, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(re_input) + conv1 = (Conv2D(16, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(conv1) + conv1 = (Conv2D(16, (3, 3), strides=(1, 3), activation='relu', + padding='same'))(conv1) + + conv2 = (Conv2D(32, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(conv1) + conv2 = (Conv2D(32, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(conv2) + conv2 = (Conv2D(32, (3, 3), strides=(1, 3), activation='relu', + padding='same'))(conv2) + + conv3 = (Conv2D(64, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(conv2) + conv3 = (Conv2D(64, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(conv3) + conv3 = (Conv2D(64, (3, 3), strides=(1, 3), activation='relu', + padding='same'))(conv3) + + conv4 = (Conv2D(128, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(conv3) + conv4 = (Conv2D(128, (3, 3), strides=(1, 1), activation='relu', + padding='same'))(conv4) + conv4 = (Conv2D(128, (3, 3), strides=(1, 3), activation='relu', + padding='same'))(conv4) + + re_shape = layers.Reshape((-1, 4*128), input_shape=(-1, 4, 128))(conv4) + + # BLSTM + blstm1 = Bidirectional( + LSTM(128, return_sequences=True, dropout=0.3, + recurrent_dropout=0.3, + recurrent_constraint=max_norm(0.00001)), + merge_mode='concat')(re_shape) + + # DNN + flatten = TimeDistributed(layers.Flatten())(blstm1) + dense1 = 
TimeDistributed(Dense(128, activation='relu'))(flatten) + dense1 = Dropout(0.3)(dense1) + + frame_score = TimeDistributed(Dense(1), name='frame')(dense1) + import warnings + + average_score = layers.GlobalAveragePooling1D(name='avg')(frame_score) + + self.model = Model(outputs=[average_score, frame_score], inputs=_input) + + # weights are in the directory of this file + pre_trained_dir = os.path.dirname(__file__) + + # load pre-trained weights. CNN_BLSTM is reported as best + self.model.load_weights(os.path.join(pre_trained_dir, 'cnn_blstm.h5')) + + def test_window(self, audios, rate): + # stft. D: (1+n_fft//2, T) + linear = librosa.stft(y=np.asfortranarray(audios[0]), + n_fft=self.FFT_SIZE, + hop_length=self.HOP_LENGTH, + win_length=self.WIN_LENGTH, + window=scipy.signal.hamming, + ) + + # magnitude spectrogram + mag = np.abs(linear) # (1+n_fft/2, T) + + # shape in (T, 1+n_fft/2) + mag = np.transpose(mag.astype(np.float32)) + + # now call the actual MOSnet + return {'mosnet': + self.model.predict(mag[None, ...], verbose=0, batch_size=1)[0]} diff --git a/scores/nb_pesq.py b/scores/nb_pesq.py new file mode 100644 index 0000000000000000000000000000000000000000..02bd009db1adbcb324fcfe087376ec0520743493 --- /dev/null +++ b/scores/nb_pesq.py @@ -0,0 +1,15 @@ +from basis import ScoreBasis + + +class NB_PESQ(ScoreBasis): + def __init__(self): + super(NB_PESQ, self).__init__(name='NB_PESQ') + self.intrusive = False + self.score_rate = 16000 + + def windowed_scoring(self, audios, score_rate): + from pypesq import pesq + if len(audios) != 2: + raise ValueError('NB_PESQ needs a reference and a test signals.') + return pesq(audios[1], audios[0], score_rate) + diff --git a/scores/pesq.py b/scores/pesq.py new file mode 100644 index 0000000000000000000000000000000000000000..ff016786fad0e5bcc98a0267c9863dd646c1d456 --- /dev/null +++ b/scores/pesq.py @@ -0,0 +1,15 @@ +from basis import ScoreBasis + +class PESQ(ScoreBasis): + def __init__(self): + super(PESQ, 
self).__init__(name='PESQ') + self.intrusive = False + self.mono = True + self.fixed_rate = 16000 + + def windowed_scoring(self, audios, rate): + from pesq import pesq + if len(audios) != 2: + raise ValueError('PESQ needs a reference and a test signals.') + return pesq(rate, audios[1], audios[0], 'wb') + diff --git a/scores/sisdr.py b/scores/sisdr.py new file mode 100644 index 0000000000000000000000000000000000000000..142b89f21f2ad35678fa38f66c99d01790bfc7e2 --- /dev/null +++ b/scores/sisdr.py @@ -0,0 +1,32 @@ +from basis import ScoreBasis +import numpy as np +from numpy.linalg import norm + +class SISDR(ScoreBasis): + def __init__(self): + super(SISDR, self).__init__(name='SISDR') + self.intrusive = False + + def windowed_scoring(self, audios, score_rate): + # as provided by @Jonathan-LeRoux and slightly adapted for the case of just one reference + # and one estimate. + # see original code here: https://github.com/sigsep/bsseval/issues/3#issuecomment-494995846 + if len(audios) != 2: + raise ValueError('PESQ needs a reference and a test signals.') + eps = np.finfo(audios[0].dtype).eps + reference = audios[1].reshape(audios[1].size, 1) + estimate = audios[0].reshape(audios[0].size, 1) + + Rss = np.dot(reference.T, reference) + + # get the scaling factor for clean sources + a = (eps + np.dot(reference.T, estimate)) / (Rss + eps) + + e_true = a * reference + e_res = estimate - e_true + + Sss = (e_true**2).sum() + Snn = (e_res**2).sum() + + return 10 * np.log10((eps+ Sss)/(eps + Snn)) + diff --git a/scores/snr.py b/scores/snr.py new file mode 100644 index 0000000000000000000000000000000000000000..e0818e04db37d0aa3b91be9561abaa464839f744 --- /dev/null +++ b/scores/snr.py @@ -0,0 +1,33 @@ +from basis import ScoreBasis +import numpy as np + +class SNR(ScoreBasis): + def __init__(self): + super(SNR, self).__init__(name='SNR') + self.intrusive = False + + def windowed_scoring(self, audios, score_rate): + if len(audios) != 2: + raise ValueError('SNR needs a reference and a 
test signals.') + return cal_SNR(audios[0], audios[1], score_rate) + +def cal_SNR(ref_wav, deg_wav, srate=16000, eps=1e-10): + # obtained from https://github.com/wooseok-shin/MetricGAN-plus-pytorch/blob/main/metric_functions/metric_helper.py + """ Segmental Signal-to-Noise Ratio Objective Speech Quality Measure + This function implements the segmental signal-to-noise ratio + as defined in [1, p. 45] (see Equation 2.12). + """ + clean_speech = ref_wav + processed_speech = deg_wav + clean_length = ref_wav.shape[0] + processed_length = deg_wav.shape[0] + + # scale both to have same dynamic range. Remove DC too. + clean_speech -= clean_speech.mean() + processed_speech -= processed_speech.mean() + processed_speech *= (np.max(np.abs(clean_speech)) / np.max(np.abs(processed_speech))) + + # Signal-to-Noise Ratio + dif = ref_wav - deg_wav + overall_snr = 10 * np.log10(np.sum(ref_wav ** 2) / (np.sum(dif ** 2) + 10e-20)) + return overall_snr diff --git a/scores/srmr/LICENSE.md b/scores/srmr/LICENSE.md new file mode 100644 index 0000000000000000000000000000000000000000..9bbbb29ce6e21137146cc67aab4b550782dec099 --- /dev/null +++ b/scores/srmr/LICENSE.md @@ -0,0 +1,22 @@ +The SRMRpy toolbox is licensed under the MIT license. + +> Copyright (c) 2014 João F. Santos, Tiago H. Falk +> +> Permission is hereby granted, free of charge, to any person obtaining a copy +> of this software and associated documentation files (the "Software"), to deal +> in the Software without restriction, including without limitation the rights +> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +> copies of the Software, and to permit persons to whom the Software is +> furnished to do so, subject to the following conditions: +> +> The above copyright notice and this permission notice shall be included in all +> copies or substantial portions of the Software. 
+> +> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +> SOFTWARE. + diff --git a/scores/srmr/__pycache__/__init__.cpython-38.pyc b/scores/srmr/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2b8d048cf938935d45f0ac2772d5195f526f440 Binary files /dev/null and b/scores/srmr/__pycache__/__init__.cpython-38.pyc differ diff --git a/scores/srmr/__pycache__/cal_srmr.cpython-38.pyc b/scores/srmr/__pycache__/cal_srmr.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e23a9948a5ac3ffcc774276bee441beafb0e235e Binary files /dev/null and b/scores/srmr/__pycache__/cal_srmr.cpython-38.pyc differ diff --git a/scores/srmr/__pycache__/hilbert.cpython-38.pyc b/scores/srmr/__pycache__/hilbert.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd002a203a4a92650082bca1b50345d80ff6fe51 Binary files /dev/null and b/scores/srmr/__pycache__/hilbert.cpython-38.pyc differ diff --git a/scores/srmr/__pycache__/metric_srmr.cpython-38.pyc b/scores/srmr/__pycache__/metric_srmr.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b37e45deb829a7fbda422f7ab3172f39b6f86fab Binary files /dev/null and b/scores/srmr/__pycache__/metric_srmr.cpython-38.pyc differ diff --git a/scores/srmr/__pycache__/modulation_filters.cpython-38.pyc b/scores/srmr/__pycache__/modulation_filters.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0dda1afb314dbd08b200ec9d63853f0ae233b1f5 Binary files /dev/null and 
b/scores/srmr/__pycache__/modulation_filters.cpython-38.pyc differ diff --git a/scores/srmr/__pycache__/srmr.cpython-38.pyc b/scores/srmr/__pycache__/srmr.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5069cc0060f0ed39c1969bb993ea53445fee8116 Binary files /dev/null and b/scores/srmr/__pycache__/srmr.cpython-38.pyc differ diff --git a/scores/srmr/cal_srmr.py b/scores/srmr/cal_srmr.py new file mode 100644 index 0000000000000000000000000000000000000000..38c006502d814eca69617fa193d7f39c63915b1f --- /dev/null +++ b/scores/srmr/cal_srmr.py @@ -0,0 +1,165 @@ +# -*- coding: utf-8 -*- +# Copyright 2014 João Felipe Santos, jfsantos@emt.inrs.ca +# +# This file is part of the SRMRpy library, and is licensed under the +# MIT license: https://github.com/jfsantos/SRMRpy/blob/master/LICENSE + +from __future__ import division +import numpy as np +from scipy.signal import hamming +from .hilbert import hilbert +from .modulation_filters import compute_modulation_cfs, modulation_filterbank,\ + modfilt +from gammatone.fftweight import fft_gtgram +from gammatone.filters import centre_freqs, make_erb_filters, erb_filterbank +from srmrpy.segmentaxis import segment_axis + +from scipy.io.wavfile import read as readwav + + +def calc_erbs(low_freq, fs, n_filters): + ear_q = 9.26449 # Glasberg and Moore Parameters + min_bw = 24.7 + order = 1 + + erbs = ((centre_freqs(fs, n_filters, low_freq)/ear_q)**order + + min_bw**order)**(1/order) + return erbs + + +def calc_cutoffs(cfs, fs, q): + # Calculates cutoff frequencies (3 dB) for 2nd order bandpass + w0 = 2*np.pi*cfs/fs + B0 = np.tan(w0/2)/q + L = cfs - (B0 * fs / (2*np.pi)) + R = cfs + (B0 * fs / (2*np.pi)) + return L, R + + +def normalize_energy(energy, drange=30.0): + peak_energy = np.max(np.mean(energy, axis=0)) + min_energy = peak_energy*10.0**(-drange/10.0) + energy[energy < min_energy] = min_energy + energy[energy > peak_energy] = peak_energy + return energy + + +def cal_SRMR(x, fs, 
n_cochlear_filters=23, low_freq=125, min_cf=4, max_cf=128, + fast=True, norm=False): + wLengthS = .256 + wIncS = .064 + # Computing gammatone envelopes + if fast: + mfs = 400.0 + gt_env = fft_gtgram(x, fs, 0.010, 0.0025, n_cochlear_filters, low_freq) + else: + cfs = centre_freqs(fs, n_cochlear_filters, low_freq) + fcoefs = make_erb_filters(fs, cfs) + gt_env = np.abs(hilbert(erb_filterbank(x, fcoefs))) + mfs = fs + + wLength = int(np.ceil(wLengthS*mfs)) + wInc = int(np.ceil(wIncS*mfs)) + + # Computing modulation filterbank with Q = 2 and 8 channels + mod_filter_cfs = compute_modulation_cfs(min_cf, max_cf, 8) + MF = modulation_filterbank(mod_filter_cfs, mfs, 2) + + n_frames = int(1 + (gt_env.shape[1] - wLength)//wInc) + w = hamming(wLength+1)[:-1] # window is periodic, not symmetric + + energy = np.zeros((n_cochlear_filters, 8, n_frames)) + for i, ac_ch in enumerate(gt_env): + mod_out = modfilt(MF, ac_ch) + for j, mod_ch in enumerate(mod_out): + mod_out_frame = segment_axis(mod_ch, wLength, + overlap=wLength-wInc, + end='pad') + energy[i, j, :] = np.sum((w*mod_out_frame[:n_frames])**2, axis=1) + + if norm: + energy = normalize_energy(energy) + + erbs = np.flipud(calc_erbs(low_freq, fs, n_cochlear_filters)) + + avg_energy = np.mean(energy, axis=2) + total_energy = np.sum(avg_energy) + + AC_energy = np.sum(avg_energy, axis=1) + AC_perc = AC_energy*100/total_energy + + AC_perc_cumsum = np.cumsum(np.flipud(AC_perc)) + K90perc_idx = np.where(AC_perc_cumsum > 90)[0][0] + + BW = erbs[K90perc_idx] + + cutoffs = calc_cutoffs(mod_filter_cfs, fs, 2)[0] + + if (BW > cutoffs[4]) and (BW < cutoffs[5]): + Kstar = 5 + elif (BW > cutoffs[5]) and (BW < cutoffs[6]): + Kstar = 6 + elif (BW > cutoffs[6]) and (BW < cutoffs[7]): + Kstar = 7 + elif (BW > cutoffs[7]): + Kstar = 8 + + return np.sum(avg_energy[:, :4])/np.sum(avg_energy[:, 4:Kstar]), energy + + +def process_file(f, args): + fs, s = readwav(f) + if len(s.shape) > 1: + s = s[:, 0] + if np.issubdtype(s.dtype, np.int): + s = 
s.astype('float')/np.iinfo(s.dtype).max + r, energy = srmr( + s, fs, n_cochlear_filters=args.n_cochlear_filters, + min_cf=args.min_cf, + max_cf=args.max_cf, + fast=args.fast, + norm=args.norm) + return f, r + + +def main(): + import argparse + import multiprocessing + import functools + + parser = argparse.ArgumentParser( + description='Compute the SRMR metric for a given WAV file') + parser.add_argument( + '-f', '--fast', dest='fast', action='store_true', default=False, + help='Use the faster version based on the gammatonegram') + parser.add_argument( + '-n', '--norm', dest='norm', action='store_true', default=False, + help='Use modulation spectrum energy normalization') + parser.add_argument( + '--ncochlearfilters', dest='n_cochlear_filters', type=int, default=23, + help='Number of filters in the acoustic filterbank') + parser.add_argument( + '--mincf', dest='min_cf', type=float, default=4.0, + help='Center frequency of the first modulation filter') + parser.add_argument( + '--maxcf', dest='max_cf', type=float, default=128.0, + help='Center frequency of the last modulation filter') + parser.add_argument( + 'path', metavar='path', nargs='+', + help='Path of the file or files to be processed.' 
+ ' Can also be a folder.') + args = parser.parse_args() + + if len(args.path) > 1: + p = multiprocessing.Pool(multiprocessing.cpu_count()) + results = dict(p.map(functools.partial(process_file, args=args), + args.path)) + for f in args.path: + print('{}: {}'.format(f, results[f])) + else: + f, r = process_file(args.path[0], args) + print('{}: {}'.format(f, r)) + + +if __name__ == '__main__': + main() diff --git a/scores/srmr/hilbert.py b/scores/srmr/hilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..3f0ba1c3f2d486e13367ab81320ba71d8e874392 --- /dev/null +++ b/scores/srmr/hilbert.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- +# Copyright 2014 João Felipe Santos, jfsantos@emt.inrs.ca +# +# This file is part of the SRMRpy library, and is licensed under the +# MIT license: https://github.com/jfsantos/SRMRpy/blob/master/LICENSE + +import numpy as np +from numpy.fft import fft, ifft + +# This is copied straight from scipy.signal. The reason is that scipy.signal's version +# will always use the fft and ifft functions from fftpack. If you have Anaconda with an MKL +# license, you can install the package mklfft, which will plug the faster MKL FFT functions +# into numpy. + +def hilbert(x, N=None, axis=-1): + """ + Compute the analytic signal, using the Hilbert transform. + The transformation is done along the last axis by default. + Parameters + ---------- + x : array_like + Signal data. Must be real. + N : int, optional + Number of Fourier components. Default: ``x.shape[axis]`` + axis : int, optional + Axis along which to do the transformation. Default: -1. + Returns + ------- + xa : ndarray + Analytic signal of `x`, of each 1-D array along `axis` + Notes + ----- + The analytic signal ``x_a(t)`` of signal ``x(t)`` is: + .. math:: x_a = F^{-1}(F(x) 2U) = x + i y + where `F` is the Fourier transform, `U` the unit step function, + and `y` the Hilbert transform of `x`. 
[1]_ + In other words, the negative half of the frequency spectrum is zeroed + out, turning the real-valued signal into a complex signal. The Hilbert + transformed signal can be obtained from ``np.imag(hilbert(x))``, and the + original signal from ``np.real(hilbert(x))``. + References + ---------- + .. [1] Wikipedia, "Analytic signal". + http://en.wikipedia.org/wiki/Analytic_signal + """ + x = np.asarray(x) + if np.iscomplexobj(x): + raise ValueError("x must be real.") + if N is None: + N = x.shape[axis] + # Make N multiple of 16 to make sure the transform will be fast + if N % 16: + N = int(np.ceil(N/16)*16) + if N <= 0: + raise ValueError("N must be positive.") + + Xf = fft(x, N, axis=axis) + h = np.zeros(N) + if N % 2 == 0: + h[0] = h[N // 2] = 1 + h[1:N // 2] = 2 + else: + h[0] = 1 + h[1:(N + 1) // 2] = 2 + + if len(x.shape) > 1: + ind = [np.newaxis] * x.ndim + ind[axis] = slice(None) + h = h[ind] + y = ifft(Xf * h, axis=axis) + return y[:x.shape[axis]] diff --git a/scores/srmr/modulation_filters.py b/scores/srmr/modulation_filters.py new file mode 100644 index 0000000000000000000000000000000000000000..1317958e790ae0c300b74d8932f64f3b1d6c0397 --- /dev/null +++ b/scores/srmr/modulation_filters.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +# Copyright 2014 João Felipe Santos, jfsantos@emt.inrs.ca +# +# This file is part of the SRMRpy library, and is licensed under the +# MIT license: https://github.com/jfsantos/SRMRpy/blob/master/LICENSE + +from __future__ import division +import numpy as np +import scipy.signal as sig + +def make_modulation_filter(w0, Q): + W0 = np.tan(w0/2) + B0 = W0/Q + b = np.array([B0, 0, -B0], dtype=np.float) + a = np.array([(1 + B0 + W0**2), (2*W0**2 - 2), (1 - B0 + W0**2)], dtype=np.float) + return b, a + +def modulation_filterbank(mf, fs, Q): + return [make_modulation_filter(w0, Q) for w0 in 2*np.pi*mf/fs] + +def compute_modulation_cfs(min_cf, max_cf, n): + spacing_factor = (max_cf/min_cf)**(1.0/(n-1)) + cfs = np.zeros(n) + cfs[0] = 
min_cf + for k in range(1,n): + cfs[k] = cfs[k-1]*spacing_factor + return cfs + +def modfilt(F, x): + y = np.zeros((len(F), len(x)), dtype=np.float) + for k, f in enumerate(F): + y[k] = sig.lfilter(f[0], f[1], x) + return y + diff --git a/scores/srmr/segmentaxis.py b/scores/srmr/segmentaxis.py new file mode 100644 index 0000000000000000000000000000000000000000..4ad5f1c31811e2a64d871091473729bb68702a77 --- /dev/null +++ b/scores/srmr/segmentaxis.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- +# Copyright 2014 João Felipe Santos, jfsantos@emt.inrs.ca +# +# This file is part of the SRMRpy library, and is licensed under the +# MIT license: https://github.com/jfsantos/SRMRpy/blob/master/LICENSE + +"""segmentaxis code, originally in scikits.talkbox (https://pypi.python.org/pypi/scikits.talkbox) + +This code has been implemented by Anne Archibald, and has been discussed on the +ML.""" +from __future__ import division +import numpy as np +import warnings + +def segment_axis(a, length, overlap=0, axis=None, end='cut', endvalue=0): + """Generate a new array that chops the given array along the given axis + into overlapping frames. + + example: + >>> segment_axis(arange(10), 4, 2) + array([[0, 1, 2, 3], + [2, 3, 4, 5], + [4, 5, 6, 7], + [6, 7, 8, 9]]) + + arguments: + a The array to segment + length The length of each frame + overlap The number of array elements by which the frames should overlap + axis The axis to operate on; if None, act on the flattened array + end What to do with the last frame, if the array is not evenly + divisible into pieces. Options are: + + 'cut' Simply discard the extra values + 'wrap' Copy values from the beginning of the array + 'pad' Pad with a constant value + + endvalue The value to use for end='pad' + + The array is not copied unless necessary (either because it is unevenly + strided and being flattened or because end is set to 'pad' or 'wrap'). 
+ """ + + if axis is None: + a = np.ravel(a) # may copy + axis = 0 + + l = a.shape[axis] + + if overlap >= length: + raise ValueError("frames cannot overlap by more than 100%") + if overlap < 0 or length <= 0: + raise ValueError("overlap must be nonnegative and length must "\ + "be positive") + + if l < length or (l-length) % (length-overlap): + if l>length: + roundup = length + (1+(l-length)//(length-overlap))*(length-overlap) + rounddown = length + ((l-length)//(length-overlap))*(length-overlap) + else: + roundup = length + rounddown = 0 + assert rounddown < l < roundup + assert roundup == rounddown + (length-overlap) \ + or (roundup == length and rounddown == 0) + a = a.swapaxes(-1,axis) + + if end == 'cut': + a = a[..., :rounddown] + elif end in ['pad','wrap']: # copying will be necessary + s = list(a.shape) + s[-1] = roundup + b = np.empty(s,dtype=a.dtype) + if end in ['pad','wrap']: + b[..., :l] = a + if end == 'pad': + b[..., l:] = endvalue + elif end == 'wrap': + b[..., l:] = a[..., :roundup-l] + a = b + elif end == 'delay': + s = list(a.shape) + l_orig = l + l += overlap + # if l not divisible by length, pad last frame with zeros + if l_orig % (length-overlap): + roundup = length + (1+(l-length)//(length-overlap))*(length-overlap) + else: + roundup = l + s[-1] = roundup + b = np.empty(s,dtype=a.dtype) + + b[..., :(overlap)] = endvalue + b[..., (overlap):(l_orig+overlap)] = a + b[..., (l_orig+overlap):] = endvalue + a = b + else: + raise ValueError("end has to be either 'cut', 'pad', 'wrap', or 'delay'.") + + a = a.swapaxes(-1,axis) + + + l = a.shape[axis] + if l == 0: + raise ValueError("Not enough data points to segment array in 'cut' mode; "\ + "try 'pad' or 'wrap'") + assert l >= length + assert (l-length) % (length-overlap) == 0 + n = 1 + (l-length) // (length-overlap) + s = a.strides[axis] + newshape = a.shape[:axis] + (n,length) + a.shape[axis+1:] + newstrides = a.strides[:axis] + ((length-overlap)*s,s) + a.strides[axis+1:] + + try: + return 
np.ndarray.__new__(np.ndarray, strides=newstrides, + shape=newshape, buffer=a, dtype=a.dtype) + except TypeError: + warnings.warn("Problem with ndarray creation forces copy.") + a = a.copy() + # Shape doesn't change but strides does + newstrides = a.strides[:axis] + ((length-overlap)*s,s) \ + + a.strides[axis+1:] + return np.ndarray.__new__(np.ndarray, strides=newstrides, + shape=newshape, buffer=a, dtype=a.dtype) diff --git a/scores/srmr/srmr.py b/scores/srmr/srmr.py new file mode 100644 index 0000000000000000000000000000000000000000..053d1bd1d134f688b8d2ade4284dc29439fe2481 --- /dev/null +++ b/scores/srmr/srmr.py @@ -0,0 +1,13 @@ +from basis import ScoreBasis + + +class SRMR(ScoreBasis): + def __init__(self): + super(SRMR, self).__init__(name='SRMR') + self.intrusive = True + self.score_rate = 16000 + + def windowed_scoring(self, audios, score_rate): + from scores.srmr.cal_srmr import cal_SRMR + return cal_SRMR(audios[0], score_rate, n_cochlear_filters=23,low_freq=125, min_cf=4,max_cf=128, fast=True, norm=False)[0] + diff --git a/scores/srmr/vad.py b/scores/srmr/vad.py new file mode 100644 index 0000000000000000000000000000000000000000..aaf014f81e70f7c76a70fd0f77dbfd5bd0023237 --- /dev/null +++ b/scores/srmr/vad.py @@ -0,0 +1,38 @@ +import numpy as np +from srmrpy.segmentaxis import segment_axis + +def simple_energy_vad(x, fs, framelen=0.02, theta_main=30, theta_min=-55): + '''Simple energy voice activity detection algorithm based on energy + thresholds as described in Tomi Kinnunen and Padmanabhan Rajan, "A + practical, self-adaptive voice activity detector for speaker verification + with noisy telephone and microphone data", ICASSP 2013, Vancouver (NOTE: + this is the benchmark method, not the method proposed by the authors). 
+ ''' + # Split signal in frames + framelen = int(framelen * fs) + frames = segment_axis(x, length=framelen, overlap=0, end='pad') + frames_zero_mean = frames - frames.mean(axis=0) + frame_energy = 10*np.log10(1/(framelen-1) * (frames_zero_mean**2).sum(axis=1) + 1e-6) + max_energy = max(frame_energy) + speech_presence = (frame_energy > max_energy - theta_main) & (frame_energy > theta_min) + x_vad = np.zeros_like(x, dtype=bool) + for idx, frame in enumerate(frames): + if speech_presence[idx]: + x_vad[idx*framelen:(idx+1)*framelen] = True + else: + x_vad[idx*framelen:(idx+1)*framelen] = False + return x[x_vad], x_vad + +if __name__ == '__main__': + import sys + from scipy.io.wavfile import read as readwav + from matplotlib import pyplot as plt + + fs, s = readwav(sys.argv[1]) + s = s.astype('float')/np.iinfo(s.dtype).max + s_vad, speech_presence = simple_energy_vad(s, fs) + + plt.plot(s) + plt.plot(s_vad - 1, 'g') + plt.show() + diff --git a/scores/ssnr.py b/scores/ssnr.py new file mode 100644 index 0000000000000000000000000000000000000000..5418a7f7189e0f2c5aa5d970e69ab04abd4ca2bf --- /dev/null +++ b/scores/ssnr.py @@ -0,0 +1,58 @@ +from basis import ScoreBasis +import numpy as np + +class SSNR(ScoreBasis): + def __init__(self): + super(SSNR, self).__init__(name='SSNR') + self.intrusive = False + + def windowed_scoring(self, audios, score_rate): + if len(audios) != 2: + raise ValueError('SSNR needs a reference and a test signals.') + return cal_SSNR(audios[0], audios[1], score_rate) + +def cal_SSNR(ref_wav, deg_wav, srate=16000, eps=1e-10): + # obtained from https://github.com/wooseok-shin/MetricGAN-plus-pytorch/blob/main/metric_functions/metric_helper.py + """ Segmental Signal-to-Noise Ratio Objective Speech Quality Measure + This function implements the segmental signal-to-noise ratio + as defined in [1, p. 45] (see Equation 2.12). 
+ """ + clean_speech = ref_wav + processed_speech = deg_wav + clean_length = ref_wav.shape[0] + processed_length = deg_wav.shape[0] + + # scale both to have same dynamic range. Remove DC too. + clean_speech -= clean_speech.mean() + processed_speech -= processed_speech.mean() + processed_speech *= (np.max(np.abs(clean_speech)) / np.max(np.abs(processed_speech))) + + # global variables + winlength = int(np.round(30 * srate / 1000)) # 30 msecs + skiprate = winlength // 4 + MIN_SNR = -10 + MAX_SNR = 35 + + # For each frame, calculate SSNR + num_frames = int(clean_length / skiprate - (winlength/skiprate)) + start = 0 + time = np.linspace(1, winlength, winlength) / (winlength + 1) + window = 0.5 * (1 - np.cos(2 * np.pi * time)) + segmental_snr = [] + + for frame_count in range(int(num_frames)): + # (1) get the frames for the test and ref speech. + # Apply Hanning Window + clean_frame = clean_speech[start:start+winlength] + processed_frame = processed_speech[start:start+winlength] + clean_frame = clean_frame * window + processed_frame = processed_frame * window + + # (2) Compute Segmental SNR + signal_energy = np.sum(clean_frame ** 2) + noise_energy = np.sum((clean_frame - processed_frame) ** 2) + segmental_snr.append(10 * np.log10(signal_energy / (noise_energy + eps)+ eps)) + segmental_snr[-1] = max(segmental_snr[-1], MIN_SNR) + segmental_snr[-1] = min(segmental_snr[-1], MAX_SNR) + start += int(skiprate) + return sum(segmental_snr) / len(segmental_snr) diff --git a/scores/stoi.py b/scores/stoi.py new file mode 100644 index 0000000000000000000000000000000000000000..e8dba9cf5ff41fd282615ef431959cda3f0b71d2 --- /dev/null +++ b/scores/stoi.py @@ -0,0 +1,16 @@ +from basis import ScoreBasis + + +class STOI(ScoreBasis): + def __init__(self): + super(STOI, self).__init__(name='STOI') + self.intrusive = False + self.mono = True + + def windowed_scoring(self, audios, score_rate): + from pystoi.stoi import stoi + if len(audios) != 2: + raise ValueError('STOI needs a reference and a 
test signals.') + + return stoi(audios[1], audios[0], score_rate, extended=False) + diff --git a/scores/wSDR.py b/scores/wSDR.py new file mode 100644 index 0000000000000000000000000000000000000000..d4eb7bac3e3626bc135446f54d7281bd4b088fa0 --- /dev/null +++ b/scores/wSDR.py @@ -0,0 +1,37 @@ +import torch + + +class WeightedSDR: + def __init__(self): + self.loss = weighted_signal_distortion_ratio_loss + + def __call__(self, output, bd): + return self.loss(output, bd) + + +def dotproduct(y, y_hat): + # batch x channel x nsamples + return torch.bmm(y.view(y.shape[0], 1, y.shape[-1]), y_hat.view(y_hat.shape[0], y_hat.shape[-1], 1)).reshape(-1) + + +def weighted_signal_distortion_ratio_loss(output, bd): + y = bd['y'] # target signal + z = bd['z'] # noise signal + + y_hat = output + z_hat = bd['x'] - y_hat # expected noise signal + + # mono channel only... + # can i fix this? + y_norm = torch.norm(y, dim=-1).squeeze(1) + z_norm = torch.norm(z, dim=-1).squeeze(1) + y_hat_norm = torch.norm(y_hat, dim=-1).squeeze(1) + z_hat_norm = torch.norm(z_hat, dim=-1).squeeze(1) + + def loss_sdr(a, a_hat, a_norm, a_hat_norm): + return dotproduct(a, a_hat) / (a_norm * a_hat_norm + 1e-8) + + alpha = y_norm.pow(2) / (y_norm.pow(2) + z_norm.pow(2) + 1e-8) + loss_wSDR = -alpha * loss_sdr(y, y_hat, y_norm, y_hat_norm) - (1 - alpha) * loss_sdr(z, z_hat, z_norm, z_hat_norm) + + return loss_wSDR.mean() diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..abe1d804809214db63d24c27f0022f2dd3f13215 --- /dev/null +++ b/setup.py @@ -0,0 +1,30 @@ + +# -*- coding: utf-8 -*- + + +from setuptools import setup, find_packages + +setup( + name="metrics", + version="1.0", + packages=find_packages(), + + install_requires=[ + 'numpy<1.24', + 'scipy', + 'tqdm', + 'resampy', + 'pystoi', + 'museval', + 'tensorflow>=2.0.0', + 'librosa', + # This is requred, but srmrpy pull it in, + # and there is a pip3 conflict if we have the following + # line. 
+ #'gammatone @ git+https://github.com/detly/gammatone', + 'pypesq @ git+https://github.com/vBaiCai/python-pesq', + 'srmrpy @ git+https://github.com/jfsantos/SRMRpy', + 'pesq @ git+https://github.com/ludlows/python-pesq', + ], + include_package_data=True +) diff --git a/speechscore.py b/speechscore.py new file mode 100644 index 0000000000000000000000000000000000000000..2745ed6a99d0d14f3ea2dd8cbc60b2c2b0eb84e2 --- /dev/null +++ b/speechscore.py @@ -0,0 +1,210 @@ +import os +import librosa +import soundfile as sf +import resampy +import numpy as np +from scores.srmr.srmr import SRMR +from scores.dnsmos.dnsmos import DNSMOS +from scores.pesq import PESQ +from scores.nb_pesq import NB_PESQ +from scores.sisdr import SISDR +from scores.stoi import STOI +from scores.fwsegsnr import FWSEGSNR +from scores.lsd import LSD +from scores.bsseval import BSSEval +from scores.snr import SNR +from scores.ssnr import SSNR +from scores.llr import LLR +from scores.csig import CSIG +from scores.cbak import CBAK +from scores.covl import COVL +from scores.mcd import MCD + +def compute_mean_results(*results): + mean_result = {} + + # Use the first dictionary as a reference for keys + for key in results[0]: + # If the value is a nested dictionary, recurse + if isinstance(results[0][key], dict): + nested_results = [d[key] for d in results] + mean_result[key] = compute_mean_results(*nested_results) + # Otherwise, compute the mean of the values + else: + mean_result[key] = sum(d[key] for d in results) / len(results) + + return mean_result + +class ScoresList: + def __init__(self): + self.scores = [] + + def __add__(self, score): + self.scores += [score] + return self + + def __str__(self): + return 'Scores: ' + ' '.join([x.name for x in self.scores]) + + def __call__(self, test_path, reference_path, window=None, score_rate=None, return_mean=False): + """ + window: float + the window length in seconds to use for scoring the files. 
+ score_rate: + the sampling rate specified for scoring the files. + """ + if test_path is None: + print(f'Please provide audio path for test_path') + return + results = {} + + if os.path.isdir(test_path): + audio_list = self.get_audio_list(test_path) + if audio_list is None: return + for audio_id in audio_list: + results_id = {} + if reference_path is not None: + data = self.audio_reader(test_path+'/'+audio_id, reference_path+'/'+audio_id) + else: + data = self.audio_reader(test_path+'/'+audio_id, None) + for score in self.scores: + result_score = score.scoring(data, window, score_rate) + results_id[score.name] = result_score + results[audio_id] = results_id + else: + data = self.audio_reader(test_path, reference_path) + for score in self.scores: + result_score = score.scoring(data, window, score_rate) + results[score.name] = result_score + + if return_mean: + mean_result = compute_mean_results(*results.values()) + results['Mean_Score'] = mean_result + + return results + + def get_audio_list(self, path): + # Initialize an empty list to store audio file names + audio_list = [] + + # Find all '.wav' audio files in the given path + path_list = librosa.util.find_files(path, ext="wav") + + # If no '.wav' files are found, try to find '.flac' audio files instead + if len(path_list) == 0: + path_list = librosa.util.find_files(path, ext="flac") + + # If no audio files are found at all, print an error message and return None + if len(path_list) == 0: + print(f'No audio files found in {path}, scoring ended!') + return None + + # Loop through the list of found audio file paths + for audio_path in path_list: + # Split the file path by '/' and append the last element (the file name) to the audio_list + audio_path_s = audio_path.split('/') + audio_list.append(audio_path_s[-1]) + + # Return the list of audio file names + return audio_list + + def audio_reader(self, test_path, reference_path): + """loading sound files and making sure they all have the same lengths + (zero-padding 
to the largest). Also works with numpy arrays. + """ + data = {} + audios = [] + maxlen = 0 + audio_test, rate_test = sf.read(test_path, always_2d=True) + + if audio_test.shape[1] > 1: + audio_test = audio_test[..., 0, None] + + rate = rate_test + if reference_path is not None: + audio_ref, rate_ref = sf.read(reference_path, always_2d=True) + if audio_ref.shape[1] > 1: + audio_ref = audio_ref[..., 0, None] + if rate_test != rate_ref: + rate = min(rate_test, rate_ref) + if rate_test != rate: + audio_test = resampy.resample(audio_test, rate_test, rate, axis=0) + if rate_ref != rate: + audio_ref = resampy.resample(audio_ref, rate_ref, rate, axis=0) + audios += [audio_test] + audios += [audio_ref] + else: + audios += [audio_test] + + maxlen = 0 + for index, audio in enumerate(audios): + maxlen = max(maxlen, audio.shape[0]) + ##padding + for index, audio in enumerate(audios): + if audio.shape[0] != maxlen: + new = np.zeros((maxlen,)) + new[:audio.shape[0]] = audio[...,0] + audios[index] = new + else: + audios[index] = audio[...,0] + data['audio'] = audios + data['rate'] = rate + return data + +def SpeechScore(scores=''): + """ Load the desired scores inside a Metrics object that can then + be called to compute all the desired scores. + + Parameters: + ---------- + scores: str or list of str + the scores matching any of these will be automatically loaded. this + match is relative to the structure of the speechscores package. 
def SpeechScore(scores=''):
    """Load the desired scores inside a ScoresList object that can then
    be called to compute all the desired scores.

    Parameters:
    ----------
    scores: str or list of str
        Score names (case-insensitive), e.g. 'pesq' or ['pesq', 'stoi'].
        An empty string '' loads every available score (this includes the
        non-intrusive ones such as SRMR and DNSMOS).

    Returns:
    --------

    A ScoresList object, that can be run to get the desired scores
    """
    # Registry maps lowercase names to scorer classes; replaces the long
    # if/elif chain and makes the "load everything" case trivial.
    registry = {
        'srmr': SRMR,
        'pesq': PESQ,
        'nb_pesq': NB_PESQ,
        'stoi': STOI,
        'sisdr': SISDR,
        'fwsegsnr': FWSEGSNR,
        'lsd': LSD,
        'bsseval': BSSEval,
        'dnsmos': DNSMOS,
        'snr': SNR,
        'ssnr': SSNR,
        'llr': LLR,
        'csig': CSIG,
        'cbak': CBAK,
        'covl': COVL,
        'mcd': MCD,
    }

    # Bug fix: iterating a plain string such as 'pesq' used to walk its
    # CHARACTERS ('p', 'e', ...), loading nothing.  Wrap single names in a
    # list, and honor the documented '' => "match all" behavior.
    if isinstance(scores, str):
        scores = list(registry) if scores == '' else [scores]

    score_cls = ScoresList()
    for score in scores:
        scorer = registry.get(score.lower())
        if scorer is None:
            print('score is pending implementation...')
        else:
            score_cls += scorer()
    return score_cls