import math

import librosa
import numpy as np
import pyworld
import pysptk
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

from basis import ScoreBasis


# Adapted from: https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py
class MCD(ScoreBasis):
    """Mel-Cepstral Distortion (MCD) score backed by :class:`Calculate_MCD`."""

    def __init__(self):
        super().__init__(name='MCD')
        self.intrusive = False
        # Calculate_MCD supports three modes: "plain", "dtw" and "dtw_sl";
        # this score always uses "plain".
        self.mcd_toolbox = Calculate_MCD(MCD_mode="plain")

    def windowed_scoring(self, audios, score_rate):
        """
        Score one window of audio.

        :param audios: sequence of exactly two waveforms; ``audios[1]`` is
            passed on as the reference and ``audios[0]`` as the synthesized
            signal
        :param score_rate: sampling rate forwarded to the MCD computation
        :return: mean MCD, or ``None`` when ``audios`` does not hold exactly
            two items
        """
        if len(audios) != 2:
            return None
        return self.mcd_toolbox.calculate_mcd(audios[1], audios[0], score_rate)


# ================================================= #
# calculate the Mel-Cepstral Distortion (MCD) value #
# ================================================= #
# Adapted from: https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py
class Calculate_MCD:
    """Compute the Mel-Cepstral Distortion between two waveforms.

    ``MCD_mode`` selects how frames of the two signals are paired:

    * ``"plain"``  - zero-pad the shorter waveform and pair frame ``i`` with
      frame ``i``;
    * ``"dtw"``    - pair frames along a ``fastdtw`` alignment path;
    * ``"dtw_sl"`` - as ``"dtw"``, with the result multiplied by the ratio of
      the longer to the shorter frame count.
    """

    # Modes accepted by average_mcd().
    MODES = ("plain", "dtw", "dtw_sl")

    def __init__(self, MCD_mode):
        super().__init__()
        self.MCD_mode = MCD_mode
        # Frame period handed to pyworld.wav2world.
        self.FRAME_PERIOD = 5.0
        # 10 / ln(10) * sqrt(2) ~= 6.141851463713754
        self.log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)

    def load_wav(self, wav_file, sample_rate):
        """
        Load a wav file with librosa as a mono signal.

        :param wav_file: path to wav file
        :param sample_rate: sampling rate passed to ``librosa.load`` as ``sr``
        :return: audio time series numpy array
        """
        wav, _ = librosa.load(wav_file, sr=sample_rate, mono=True)
        return wav

    # distance metric
    def log_spec_dB_dist(self, x, y):
        """
        Return ``log_spec_dB_const`` times the Euclidean norm of ``x - y``.

        :param x: first feature vector
        :param y: second feature vector
        :return: scaled distance as a float
        """
        diff = x - y
        return self.log_spec_dB_const * math.sqrt(np.inner(diff, diff))

    # calculate distance (metric)
    def calculate_mcd_distance(self, x, y, path):
        """
        Sum the per-pair Euclidean distances along an alignment path.

        :param x: 2D array of feature frames
        :param y: 2D array of feature frames
        :param path: sequence of ``(index_into_x, index_into_y)`` pairs
        :return: ``(frames_tot, min_cost_tot)`` - the number of pairs and the
            un-scaled sum of their Euclidean distances
        """
        rows_x = [pair[0] for pair in path]
        rows_y = [pair[1] for pair in path]
        x, y = x[rows_x], y[rows_y]

        frames_tot = x.shape[0]  # number of aligned pairs
        delta = x - y
        min_cost_tot = np.sqrt((delta * delta).sum(-1)).sum()
        return frames_tot, min_cost_tot

    # extract acoustic features
    # alpha = 0.65 is commonly used at 22050 Hz
    def wav2mcep_numpy(self, loaded_wav, score_rate=22050, alpha=0.65, fft_size=512):
        """
        Extract mel-cepstral coefficients from a waveform.

        :param loaded_wav: audio samples as a numpy array
        :param score_rate: sampling rate handed to ``pyworld.wav2world``
        :param alpha: ``alpha`` argument of ``pysptk.sptk.mcep``
        :param fft_size: ``fft_size`` argument of ``pyworld.wav2world``
        :return: output of ``pysptk.sptk.mcep`` (order 13)
        """
        # Use the WORLD vocoder to obtain the spectral envelope (second value).
        _, sp, _ = pyworld.wav2world(loaded_wav.astype(np.double), fs=score_rate,
                                     frame_period=self.FRAME_PERIOD, fft_size=fft_size)

        # Extract MCEP features from the envelope.
        mcep = pysptk.sptk.mcep(sp, order=13, alpha=alpha, maxiter=0,
                                etype=1, eps=1.0E-8, min_det=0.0, itype=3)
        return mcep

    # calculate the Mel-Cepstral Distortion (MCD) value
    def average_mcd(self, loaded_ref_wav, loaded_syn_wav, cost_function, MCD_mode, score_rate):
        """
        Calculate the average MCD between two already-loaded waveforms.

        :param loaded_ref_wav: reference audio samples (numpy array)
        :param loaded_syn_wav: synthesized audio samples (numpy array)
        :param cost_function: distance metric; accepted for interface
            compatibility but not used by this method
        :param MCD_mode: one of ``"plain"``, ``"dtw"`` or ``"dtw_sl"``
        :param score_rate: sampling rate of both waveforms
        :returns: average MCD over the paired frames
        :raises ValueError: if ``MCD_mode`` is not a supported mode
        """
        if MCD_mode not in self.MODES:
            raise ValueError(
                "Unsupported MCD_mode %r; expected one of %r" % (MCD_mode, self.MODES)
            )

        # NOTE(review): the padding, feature-extraction and path-selection
        # steps below were reconstructed from the upstream pymcd
        # implementation because this span of the file was truncated.
        # Confirm against version control.
        if MCD_mode == "plain":
            # Zero-pad the shorter waveform so both have the same length and
            # can be compared frame by frame.
            ref_len, syn_len = len(loaded_ref_wav), len(loaded_syn_wav)
            if ref_len < syn_len:
                loaded_ref_wav = np.pad(loaded_ref_wav, (0, syn_len - ref_len))
            else:
                loaded_syn_wav = np.pad(loaded_syn_wav, (0, ref_len - syn_len))

        # extract MCEP features (vectors): 2D matrix (num x mcep_size)
        ref_mcep_vec = self.wav2mcep_numpy(loaded_ref_wav, score_rate)
        syn_mcep_vec = self.wav2mcep_numpy(loaded_syn_wav, score_rate)

        # Multiplier applied to the final value; only "dtw_sl" changes it.
        scale = 1.0
        if MCD_mode == "plain":
            path = [(i, i) for i in range(len(ref_mcep_vec))]
        else:
            if MCD_mode == "dtw_sl":
                n_ref, n_syn = len(ref_mcep_vec), len(syn_mcep_vec)
                # Ratio of the longer to the shorter frame count (>= 1).
                scale = n_ref / n_syn if n_ref > n_syn else n_syn / n_ref
            # The alignment ignores the 0th coefficient; the distance below
            # is taken over all coefficients.
            _, path = fastdtw(ref_mcep_vec[:, 1:], syn_mcep_vec[:, 1:], dist=euclidean)

        frames_tot, min_cost_tot = self.calculate_mcd_distance(ref_mcep_vec, syn_mcep_vec, path)

        mean_mcd = scale * self.log_spec_dB_const * min_cost_tot / frames_tot
        return mean_mcd

    # calculate mcd
    def calculate_mcd(self, reference_audio, synthesized_audio, score_rate):
        """
        Compute the mean MCD using the mode chosen at construction time.

        :param reference_audio: reference audio samples (numpy array)
        :param synthesized_audio: synthesized audio samples (numpy array)
        :param score_rate: sampling rate of both waveforms
        :return: mean MCD as returned by :meth:`average_mcd`
        """
        return self.average_mcd(reference_audio, synthesized_audio,
                                self.log_spec_dB_dist, self.MCD_mode, score_rate)