from basis import ScoreBasis
import numpy as np

class SSNR(ScoreBasis):
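    """Segmental Signal-to-Noise Ratio (SSNR) metric, exposed through the
    repository's ScoreBasis interface."""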
    def __init__(self):
        super(SSNR, self).__init__(name='SSNR')
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
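        """Score one (reference, degraded) pair of 1-D signals sampled at
        `score_rate` Hz; returns the segmental SNR in dB."""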
        if len(audios) != 2:
            raise ValueError('SSNR needs a reference and a test signal.')
        return cal_SSNR(audios[0], audios[1], score_rate)

def cal_SSNR(ref_wav, deg_wav, srate=16000, eps=1e-10):
    # obtained from https://github.com/wooseok-shin/MetricGAN-plus-pytorch/blob/main/metric_functions/metric_helper.py
    """ Segmental Signal-to-Noise Ratio Objective Speech Quality Measure
        This function implements the segmental signal-to-noise ratio
        as defined in [1, p. 45] (see Equation 2.12): the SNR is computed
        per 30 ms frame (in dB), clamped to the [MIN_SNR, MAX_SNR] range,
        and the frame values are averaged.
    """
    # Work on float copies so the caller's arrays are not modified in place
    # and integer inputs do not break the in-place mean subtraction below.
    clean_speech     = np.array(ref_wav, dtype=np.float64)
    processed_speech = np.array(deg_wav, dtype=np.float64)
    clean_length     = clean_speech.shape[0]
    processed_length = processed_speech.shape[0]

    # Remove DC and scale both signals to the same dynamic range.
    clean_speech     -= clean_speech.mean()
    processed_speech -= processed_speech.mean()
    processed_speech *= (np.max(np.abs(clean_speech)) / np.max(np.abs(processed_speech)))
   
    # Analysis parameters
    winlength = int(np.round(30 * srate / 1000)) # 30 ms window
    skiprate  = winlength // 4
    MIN_SNR   = -10
    MAX_SNR   = 35

    # Frame-by-frame SSNR: Hanning-windowed 30 ms frames with 75% overlap
    num_frames    = int(clean_length / skiprate - (winlength/skiprate))
    start         = 0
    time          = np.linspace(1, winlength, winlength) / (winlength + 1)
    window        = 0.5 * (1 - np.cos(2 * np.pi * time))
    segmental_snr = []

    for frame_count in range(num_frames):
        # (1) get the frames for the test and ref speech.
        # Apply Hanning Window
        clean_frame     = clean_speech[start:start+winlength]
        processed_frame = processed_speech[start:start+winlength]
        clean_frame     = clean_frame * window
        processed_frame = processed_frame * window

        # (2) Compute Segmental SNR
        signal_energy = np.sum(clean_frame ** 2)
        noise_energy  = np.sum((clean_frame - processed_frame) ** 2)
        frame_snr     = 10 * np.log10(signal_energy / (noise_energy + eps) + eps)
        # Clamp the per-frame SNR to [MIN_SNR, MAX_SNR] dB before averaging.
        segmental_snr.append(np.clip(frame_snr, MIN_SNR, MAX_SNR))
        start += skiprate

    return sum(segmental_snr) / len(segmental_snr)
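

if __name__ == "__main__":
    # Minimal sanity-check sketch, not part of the original metric code:
    # it calls cal_SSNR directly on synthetic 16 kHz signals, so it runs
    # without the ScoreBasis machinery; the names below (rng, clean, noisy)
    # are illustrative only.
    rng   = np.random.default_rng(0)
    srate = 16000
    t     = np.arange(srate) / srate                      # 1 second of audio
    clean = np.sin(2 * np.pi * 440.0 * t)                 # reference: 440 Hz tone
    noisy = clean + 0.05 * rng.standard_normal(t.shape)   # degraded copy with additive noise

    print("SSNR (clean vs. noisy) :", cal_SSNR(clean, noisy, srate))
    print("SSNR (clean vs. itself):", cal_SSNR(clean, clean, srate))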