"""DNSMOS scorer: runs two ONNX models over fixed-length audio segments.

The module exposes ``DNSMOS`` (a ``ScoreBasis`` subclass) which delegates to
``ComputeScore``. ``ComputeScore`` slides a window of ``INPUT_LENGTH`` seconds
over the signal in one-second hops, runs both models on every window and
returns the per-window averages.
"""
import os

import librosa
import numpy as np
import numpy.polynomial.polynomial as poly
import onnxruntime as ort
import soundfile as sf

from basis import ScoreBasis

# Rate used for ``DNSMOS.score_rate`` and as the mel-spectrogram default.
SAMPLING_RATE = 16000
# Length, in seconds, of each segment fed to the models.
INPUT_LENGTH = 9.01


class DNSMOS(ScoreBasis):
    """``ScoreBasis`` front-end that scores audio with :class:`ComputeScore`."""

    def __init__(self):
        super().__init__(name='DNSMOS')
        # NOTE(review): ``windowed_scoring`` only ever scores a single signal,
        # so confirm against ScoreBasis that ``intrusive = True`` is intended.
        self.intrusive = True
        self.score_rate = SAMPLING_RATE
        # Model paths are relative, so they resolve against the process's
        # current working directory.
        self.p808_model_path = os.path.join('scores/dnsmos/DNSMOS', 'model_v8.onnx')
        self.primary_model_path = os.path.join('scores/dnsmos/DNSMOS', 'sig_bak_ovr.onnx')
        self.compute_score = ComputeScore(self.primary_model_path, self.p808_model_path)

    def windowed_scoring(self, audios, rate):
        """Score one signal from ``audios`` at sampling rate ``rate``.

        When ``audios`` holds exactly two entries the second one is scored;
        in every other case the first entry is scored.

        Returns:
            The dict produced by :meth:`ComputeScore.cal_mos`.
        """
        target = audios[1] if len(audios) == 2 else audios[0]
        return self.compute_score.cal_mos(target, rate)


class ComputeScore:
    """Holds the two ONNX inference sessions and computes the averaged scores."""

    # Quadratic mappings applied to the raw outputs of the primary model.
    # Built once here instead of on every segment; coefficients are listed
    # highest degree first, as ``np.poly1d`` expects.
    _OVR_POLY = np.poly1d([-0.06766283, 1.11546468, 0.04602535])
    _SIG_POLY = np.poly1d([-0.08397278, 1.22083953, 0.0052439])
    _BAK_POLY = np.poly1d([-0.13166888, 1.60915514, -0.39604546])

    def __init__(self, primary_model_path, p808_model_path) -> None:
        """Load both ONNX models.

        Args:
            primary_model_path: Path of the model fed with the raw waveform.
            p808_model_path: Path of the model fed with the mel spectrogram.
        """
        self.onnx_sess = ort.InferenceSession(primary_model_path)
        self.p808_onnx_sess = ort.InferenceSession(p808_model_path)

    def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160,
                      sr=SAMPLING_RATE, to_db=True):
        """Return the transposed mel spectrogram of ``audio``.

        The FFT size passed to librosa is ``frame_size + 1``. When ``to_db``
        is true the power spectrogram is converted to dB relative to its
        maximum and then rescaled as ``(dB + 40) / 40``.

        Returns:
            The spectrogram transposed with ``.T``.
        """
        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=sr,
            n_fft=frame_size + 1,
            hop_length=hop_length,
            n_mels=n_mels,
        )
        if to_db:
            mel_spec = (librosa.power_to_db(mel_spec, ref=np.max) + 40) / 40
        return mel_spec.T

    def get_polyfit_val(self, sig, bak, ovr):
        """Apply the fixed quadratic mapping to each raw score.

        Returns:
            ``(sig_poly, bak_poly, ovr_poly)``.
        """
        return self._SIG_POLY(sig), self._BAK_POLY(bak), self._OVR_POLY(ovr)

    def cal_mos(self, audio, sampling_rate):
        """Average both models' outputs over sliding segments of ``audio``.

        Audio shorter than ``INPUT_LENGTH`` seconds is repeatedly doubled
        (appended to itself) until it is long enough. Segments of
        ``INPUT_LENGTH`` seconds are then taken at hops of ``sampling_rate``
        samples (one second).

        Args:
            audio: One-dimensional sequence of samples.
            sampling_rate: Samples per second of ``audio``.

        Returns:
            Dict with the keys ``'OVRL'``, ``'SIG'``, ``'BAK'`` and
            ``'P808_MOS'``, each the mean over all scored segments.

        Raises:
            ValueError: If ``audio`` is empty. Doubling an empty signal never
                reaches the required length, so the padding loop below would
                otherwise spin forever.
        """
        if len(audio) == 0:
            raise ValueError('cal_mos() requires non-empty audio')

        fs = sampling_rate
        len_samples = int(INPUT_LENGTH * fs)
        while len(audio) < len_samples:
            audio = np.append(audio, audio)

        num_hops = int(np.floor(len(audio) / fs) - INPUT_LENGTH) + 1
        hop_len_samples = fs

        sig_scores = []
        bak_scores = []
        ovr_scores = []
        p808_scores = []

        for idx in range(num_hops):
            start = int(idx * hop_len_samples)
            stop = int((idx + INPUT_LENGTH) * hop_len_samples)
            audio_seg = audio[start:stop]
            # Skip a trailing segment that came out shorter than a full window.
            if len(audio_seg) < len_samples:
                continue

            input_features = np.array(audio_seg).astype('float32')[np.newaxis, :]
            # The last 160 samples (one default mel hop) are dropped before
            # the spectrogram is computed — presumably to match the frame
            # count the P808 model expects; TODO confirm.
            p808_input_features = np.array(
                self.audio_melspec(audio=audio_seg[:-160])
            ).astype('float32')[np.newaxis, :, :]

            oi = {'input_1': input_features}
            p808_oi = {'input_1': p808_input_features}

            p808_mos = self.p808_onnx_sess.run(None, p808_oi)[0][0][0]
            mos_sig_raw, mos_bak_raw, mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0]
            mos_sig, mos_bak, mos_ovr = self.get_polyfit_val(
                mos_sig_raw, mos_bak_raw, mos_ovr_raw
            )

            sig_scores.append(mos_sig)
            bak_scores.append(mos_bak)
            ovr_scores.append(mos_ovr)
            p808_scores.append(p808_mos)

        return {
            'OVRL': np.mean(ovr_scores),
            'SIG': np.mean(sig_scores),
            'BAK': np.mean(bak_scores),
            'P808_MOS': np.mean(p808_scores),
        }