Spaces:
Running
Running
import os | |
import librosa | |
import numpy as np | |
import numpy.polynomial.polynomial as poly | |
import onnxruntime as ort | |
import soundfile as sf | |
SAMPLING_RATE = 16000 | |
INPUT_LENGTH = 9.01 | |
from basis import ScoreBasis | |
class DNSMOS(ScoreBasis):
    """ScoreBasis plug-in that scores audio with the DNSMOS ONNX models.

    Construction resolves the two model files under ``scores/dnsmos/DNSMOS``
    (relative to the current working directory) and builds a ``ComputeScore``
    helper from them; scoring is delegated to that helper.
    """

    def __init__(self):
        super().__init__(name='DNSMOS')
        # NOTE(review): only one signal is ever passed to the scorer below;
        # confirm against ScoreBasis that ``intrusive = True`` is intended.
        self.intrusive = True
        self.score_rate = 16000

        model_dir = 'scores/dnsmos/DNSMOS'
        self.p808_model_path = os.path.join(model_dir, 'model_v8.onnx')
        self.primary_model_path = os.path.join(model_dir, 'sig_bak_ovr.onnx')
        self.compute_score = ComputeScore(
            self.primary_model_path,
            self.p808_model_path,
        )

    def windowed_scoring(self, audios, rate):
        """Score one signal from ``audios`` and return the MOS dictionary.

        When exactly two signals are supplied the second one is scored;
        otherwise the first one is. Presumably the second entry is the
        processed signal when a reference is also given -- confirm against
        the caller.
        """
        target = audios[1] if len(audios) == 2 else audios[0]
        return self.compute_score.cal_mos(target, rate)
class ComputeScore:
    """Run the DNSMOS ONNX models over a signal and average per-window scores.

    Two ONNX Runtime sessions are held: ``onnx_sess`` is fed the raw waveform
    and its output is unpacked as (SIG, BAK, OVRL); ``p808_onnx_sess`` is fed
    a mel spectrogram and yields the P808 MOS value.
    """

    # Quadratic calibration curves (highest power first) applied to the raw
    # SIG / BAK / OVRL model outputs. Built once instead of on every call.
    _P_OVR = np.poly1d([-0.06766283, 1.11546468, 0.04602535])
    _P_SIG = np.poly1d([-0.08397278, 1.22083953, 0.0052439])
    _P_BAK = np.poly1d([-0.13166888, 1.60915514, -0.39604546])

    def __init__(self, primary_model_path, p808_model_path) -> None:
        """Load both ONNX models from the given file paths."""
        self.onnx_sess = ort.InferenceSession(primary_model_path)
        self.p808_onnx_sess = ort.InferenceSession(p808_model_path)

    def audio_melspec(self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True):
        """Return the mel spectrogram of ``audio``, transposed.

        The FFT size is ``frame_size + 1``. When ``to_db`` is true the power
        spectrogram is converted to dB relative to its maximum and rescaled
        as ``(dB + 40) / 40``.
        """
        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=sr,
            n_fft=frame_size + 1,
            hop_length=hop_length,
            n_mels=n_mels,
        )
        if to_db:
            mel_spec = (librosa.power_to_db(mel_spec, ref=np.max) + 40) / 40
        return mel_spec.T

    def get_polyfit_val(self, sig, bak, ovr):
        """Apply the calibration polynomials to raw scores.

        Returns:
            Tuple ``(sig_poly, bak_poly, ovr_poly)`` in that order.
        """
        return self._P_SIG(sig), self._P_BAK(bak), self._P_OVR(ovr)

    def cal_mos(self, audio, sampling_rate):
        """Score ``audio`` window by window and return the mean of each score.

        The signal is repeated until it spans at least ``INPUT_LENGTH``
        seconds, then scored in ``INPUT_LENGTH``-second windows that advance
        by one second.

        Args:
            audio: 1-D sequence of samples.
            sampling_rate: Sample rate of ``audio`` in Hz.

        Returns:
            Dict with keys ``'OVRL'``, ``'SIG'``, ``'BAK'`` and ``'P808_MOS'``.

        Raises:
            ValueError: If ``audio`` is empty. Repeating an empty signal never
                makes it longer, so the padding loop would not terminate.
        """
        if len(audio) == 0:
            raise ValueError("audio must contain at least one sample")

        fs = sampling_rate
        len_samples = int(INPUT_LENGTH * fs)

        # Pad short clips by repeating the signal until one full window fits.
        while len(audio) < len_samples:
            audio = np.append(audio, audio)

        num_hops = int(np.floor(len(audio) / fs) - INPUT_LENGTH) + 1
        hop_len_samples = fs  # windows advance by one second

        sig_scores = []
        bak_scores = []
        ovr_scores = []
        p808_scores = []

        for idx in range(num_hops):
            start = int(idx * hop_len_samples)
            stop = int((idx + INPUT_LENGTH) * hop_len_samples)
            audio_seg = audio[start:stop]
            if len(audio_seg) < len_samples:
                # Trailing partial window: skip it.
                continue

            input_features = np.array(audio_seg).astype('float32')[np.newaxis, :]
            # The P808 model receives the spectrogram of the window minus its
            # last 160 samples.
            p808_input_features = np.array(
                self.audio_melspec(audio=audio_seg[:-160])
            ).astype('float32')[np.newaxis, :, :]

            p808_mos = self.p808_onnx_sess.run(None, {'input_1': p808_input_features})[0][0][0]
            mos_sig_raw, mos_bak_raw, mos_ovr_raw = self.onnx_sess.run(
                None, {'input_1': input_features}
            )[0][0]
            mos_sig, mos_bak, mos_ovr = self.get_polyfit_val(
                mos_sig_raw, mos_bak_raw, mos_ovr_raw
            )

            sig_scores.append(mos_sig)
            bak_scores.append(mos_bak)
            ovr_scores.append(mos_ovr)
            p808_scores.append(p808_mos)

        results = {}
        results['OVRL'] = np.mean(ovr_scores)
        results['SIG'] = np.mean(sig_scores)
        results['BAK'] = np.mean(bak_scores)
        results['P808_MOS'] = np.mean(p808_scores)
        return results