Spaces:
Running
Running
File size: 3,822 Bytes
936f6fa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import os
import librosa
import numpy as np
import numpy.polynomial.polynomial as poly
import onnxruntime as ort
import soundfile as sf
# Sample rate constant (presumably Hz); not referenced by the code visible in
# this file, which takes the rate as an argument instead.
SAMPLING_RATE = 16000
# Duration of one scoring window. ComputeScore.cal_mos multiplies it by the
# sample rate to get the window length in samples, so the unit is seconds.
INPUT_LENGTH = 9.01
from basis import ScoreBasis
class DNSMOS(ScoreBasis):
    """Expose DNSMOS scoring through the ``ScoreBasis`` plugin interface.

    Construction loads both ONNX models (via ``ComputeScore``) from the
    relative directory ``scores/dnsmos/DNSMOS``, so the paths are resolved
    against the current working directory.
    """

    def __init__(self):
        """Register the score under the name ``'DNSMOS'`` and load the models."""
        super().__init__(name='DNSMOS')
        model_dir = 'scores/dnsmos/DNSMOS'
        # NOTE(review): windowed_scoring only ever scores a single signal;
        # confirm that ``intrusive = True`` is what ScoreBasis expects here.
        self.intrusive = True
        self.score_rate = 16000
        self.p808_model_path = os.path.join(model_dir, 'model_v8.onnx')
        self.primary_model_path = os.path.join(model_dir, 'sig_bak_ovr.onnx')
        self.compute_score = ComputeScore(self.primary_model_path, self.p808_model_path)

    def windowed_scoring(self, audios, rate):
        """Score one signal out of ``audios`` and return the DNSMOS result dict.

        Args:
            audios: Sequence of audio signals. When it holds exactly two
                entries the second one is scored; otherwise the first one is.
                (Presumably the pair is (reference, degraded) -- verify
                against the caller.)
            rate: Sample rate passed straight through to
                ``ComputeScore.cal_mos``.

        Returns:
            Whatever ``ComputeScore.cal_mos`` returns for the chosen signal.
        """
        chosen = audios[1] if len(audios) == 2 else audios[0]
        return self.compute_score.cal_mos(chosen, rate)
class ComputeScore:
    """Run the two DNSMOS ONNX models over sliding windows of an audio signal.

    One session (``onnx_sess``) produces the raw signal / background / overall
    scores from the waveform; the other (``p808_onnx_sess``) produces a single
    score from a mel-spectrogram of the same window.
    """

    # Quadratic calibration curves applied to the raw primary-model outputs.
    # Coefficients are listed highest degree first, as ``np.poly1d`` expects.
    # Built once here instead of on every window.
    _P_OVR = np.poly1d([-0.06766283, 1.11546468, 0.04602535])
    _P_SIG = np.poly1d([-0.08397278, 1.22083953, 0.0052439])
    _P_BAK = np.poly1d([-0.13166888, 1.60915514, -0.39604546])

    def __init__(self, primary_model_path, p808_model_path) -> None:
        """Create one ONNX Runtime inference session per model file.

        Args:
            primary_model_path: Path to the model yielding (sig, bak, ovr).
            p808_model_path: Path to the model yielding the P.808 score.
        """
        self.onnx_sess = ort.InferenceSession(primary_model_path)
        self.p808_onnx_sess = ort.InferenceSession(p808_model_path)

    def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True):
        """Return the mel-spectrogram of ``audio``, transposed.

        Args:
            audio: Waveform handed to ``librosa.feature.melspectrogram``.
            n_mels: Number of mel bands.
            frame_size: The FFT size used is ``frame_size + 1``.
            hop_length: Hop between successive frames, in samples.
            sr: Sample rate passed to librosa.
            to_db: When true, convert power to dB relative to the maximum and
                rescale with ``(dB + 40) / 40``.

        Returns:
            The transpose of librosa's output array.
        """
        mel_spec = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_fft=frame_size + 1, hop_length=hop_length, n_mels=n_mels
        )
        if to_db:
            mel_spec = (librosa.power_to_db(mel_spec, ref=np.max) + 40) / 40
        return mel_spec.T

    def get_polyfit_val(self, sig, bak, ovr):
        """Map raw model scores through their calibration polynomials.

        Args:
            sig: Raw signal score.
            bak: Raw background score.
            ovr: Raw overall score.

        Returns:
            Tuple ``(sig_poly, bak_poly, ovr_poly)`` of calibrated values.
        """
        return self._P_SIG(sig), self._P_BAK(bak), self._P_OVR(ovr)

    def cal_mos(self, audio, sampling_rate):
        """Score ``audio`` window by window and average the per-window results.

        Audio shorter than one window (``INPUT_LENGTH`` seconds) is repeatedly
        concatenated with itself until it is long enough. Windows then start
        every ``sampling_rate`` samples (i.e. one second apart).

        Args:
            audio: 1-D sequence of samples; must be non-empty.
            sampling_rate: Samples per second of ``audio``.

        Returns:
            Dict with keys ``'OVRL'``, ``'SIG'``, ``'BAK'`` (means of the
            calibrated primary-model scores) and ``'P808_MOS'`` (mean of the
            second model's score).

        Raises:
            ValueError: If ``audio`` is empty. Self-concatenation can never
                lengthen an empty signal, so the padding loop would otherwise
                never terminate.
        """
        if len(audio) == 0:
            raise ValueError("cal_mos() requires non-empty audio")

        fs = sampling_rate
        len_samples = int(INPUT_LENGTH * fs)
        # Pad by doubling; kept as-is so the number of windows (and therefore
        # the averaged score) matches earlier results for short clips.
        while len(audio) < len_samples:
            audio = np.append(audio, audio)

        num_hops = int(np.floor(len(audio) / fs) - INPUT_LENGTH) + 1
        hop_len_samples = fs

        sig_scores = []
        bak_scores = []
        ovr_scores = []
        p808_scores = []

        for idx in range(num_hops):
            seg_start = int(idx * hop_len_samples)
            seg_stop = int((idx + INPUT_LENGTH) * hop_len_samples)
            audio_seg = audio[seg_start:seg_stop]
            # Skip any window that came out short of the full length.
            if len(audio_seg) < len_samples:
                continue

            input_features = np.array(audio_seg).astype('float32')[np.newaxis, :]
            # The second model is fed a spectrogram of the window minus its
            # final 160 samples (one default hop of audio_melspec).
            p808_input_features = np.array(
                self.audio_melspec(audio=audio_seg[:-160])
            ).astype('float32')[np.newaxis, :, :]

            oi = {'input_1': input_features}
            p808_oi = {'input_1': p808_input_features}

            p808_mos = self.p808_onnx_sess.run(None, p808_oi)[0][0][0]
            mos_sig_raw, mos_bak_raw, mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0]
            mos_sig, mos_bak, mos_ovr = self.get_polyfit_val(mos_sig_raw, mos_bak_raw, mos_ovr_raw)

            sig_scores.append(mos_sig)
            bak_scores.append(mos_bak)
            ovr_scores.append(mos_ovr)
            p808_scores.append(p808_mos)

        results = {}
        results['OVRL'] = np.mean(ovr_scores)
        results['SIG'] = np.mean(sig_scores)
        results['BAK'] = np.mean(bak_scores)
        results['P808_MOS'] = np.mean(p808_scores)
        return results
|