Spaces:
Running
on
Zero
Running
on
Zero
# Source: https://github.com/microsoft/DNS-Challenge/tree/master/DNSMOS
#
# Copyright (c) 2022 Microsoft
#
# This code is licensed under the Creative Commons Attribution 4.0 International (CC BY 4.0) license.
# The full license text is available at the root of the source repository.
#
# Note: This code has been modified to fit the context of this repository.
# This code is included in an MIT-licensed repository.
# The repository's MIT license does not apply to this code.
import os | |
import librosa | |
import numpy as np | |
import onnxruntime as ort | |
import pandas as pd | |
import tqdm | |
import warnings | |
# Suppress every warning for the whole process once this module is imported.
warnings.filterwarnings("ignore")

# Sampling rate (Hz) that audio is loaded at or resampled to before scoring.
SAMPLING_RATE = 16000
# Duration, in seconds, of each window of audio passed to the model.
INPUT_LENGTH = 9.01
class ComputeScore:
    """
    ComputeScore class for evaluating DNSMOS.

    An instance wraps an ONNX inference session and is callable: calling it
    with an audio clip returns a dictionary of raw and polynomial-adjusted
    SIG / BAK / OVRL scores averaged over sliding windows of the clip.
    """

    # Calibration polynomials, each stored in (sig, bak, ovr) order. They are
    # built once here instead of on every hop of every scored clip.
    _PERSONALIZED_POLYS = (
        np.poly1d([-0.01019296, 0.02751166, 1.19576786, -0.24348726]),
        np.poly1d([-0.04976499, 0.44276479, -0.1644611, 0.96883132]),
        np.poly1d([-0.00533021, 0.005101, 1.18058466, -0.11236046]),
    )
    _DEFAULT_POLYS = (
        np.poly1d([-0.08397278, 1.22083953, 0.0052439]),
        np.poly1d([-0.13166888, 1.60915514, -0.39604546]),
        np.poly1d([-0.06766283, 1.11546468, 0.04602535]),
    )

    def __init__(self, primary_model_path, device="cpu") -> None:
        """
        Initialize the ComputeScore object.

        Args:
            primary_model_path (str): Path to the primary model.
            device (str): ``"cuda"`` requests the CUDA execution provider.
                Any other value creates the session without an explicit
                provider list, i.e. with onnxruntime's defaults.

        Returns:
            None

        Note:
            No device validation is performed here; an unrecognised
            ``device`` string is treated like ``"cpu"`` rather than rejected.
        """
        if device == "cuda":
            self.onnx_sess = ort.InferenceSession(
                primary_model_path, providers=["CUDAExecutionProvider"]
            )
            # Report what the session actually ended up with, which may
            # differ from what was requested.
            print("Using CUDA:", self.onnx_sess.get_providers())
        else:
            self.onnx_sess = ort.InferenceSession(primary_model_path)

    def audio_melspec(
        self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True
    ):
        """
        Compute the mel spectrogram of an audio signal.

        Args:
            audio (np.ndarray): Input audio signal.
            n_mels (int): Number of mel bands.
            frame_size (int): Frame size; the FFT length used is
                ``frame_size + 1``.
            hop_length (int): Number of samples between successive frames.
            sr (int): Sampling rate.
            to_db (bool): Whether to convert the power spectrogram to decibel
                units (relative to its maximum) and rescale it as
                ``(dB + 40) / 40``.

        Returns:
            np.ndarray: Transposed mel spectrogram.
        """
        mel_spec = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_fft=frame_size + 1, hop_length=hop_length, n_mels=n_mels
        )
        if to_db:
            mel_spec = (librosa.power_to_db(mel_spec, ref=np.max) + 40) / 40
        return mel_spec.T

    def get_polyfit_val(self, sig, bak, ovr, is_personalized_MOS):
        """
        Apply polynomial fitting to MOS scores.

        Args:
            sig (float): Signal MOS score.
            bak (float): Background MOS score.
            ovr (float): Overall MOS score.
            is_personalized_MOS (bool): Selects the personalized (cubic)
                polynomials when true, the default (quadratic) ones otherwise.

        Returns:
            tuple: Adjusted ``(sig, bak, ovr)`` MOS scores, in that order.
        """
        if is_personalized_MOS:
            p_sig, p_bak, p_ovr = self._PERSONALIZED_POLYS
        else:
            p_sig, p_bak, p_ovr = self._DEFAULT_POLYS
        return p_sig(sig), p_bak(bak), p_ovr(ovr)

    @staticmethod
    def _require_samples(audio):
        """Raise ValueError if ``audio`` is None or contains no samples."""
        if audio is None or len(audio) == 0:
            raise ValueError("Input audio is empty; cannot compute DNSMOS scores.")

    def __call__(self, audio, sampling_rate, is_personalized_MOS):
        """
        Compute DNSMOS scores for an audio signal.

        The clip is brought to ``SAMPLING_RATE``, repeated until it is at
        least ``INPUT_LENGTH`` seconds long, then scored on windows of
        ``INPUT_LENGTH`` seconds taken every second. Per-window scores are
        averaged.

        Args:
            audio (np.ndarray or str): Input audio signal or path to audio file.
            sampling_rate (int): Sampling rate of the input audio. Ignored
                when ``audio`` is a path, because the file is loaded directly
                at ``SAMPLING_RATE``.
            is_personalized_MOS (bool): Flag for personalized MOS.

        Returns:
            dict: Keys ``filename``, ``len_in_sec``, ``sr``, ``num_hops``,
            ``OVRL_raw``, ``SIG_raw``, ``BAK_raw``, ``OVRL``, ``SIG``, ``BAK``.
            ``len_in_sec`` reflects the clip before it was repeated.

        Raises:
            ValueError: If the input audio is None or has no samples.
        """
        # Reject unusable in-memory input before doing any work. An empty
        # array would otherwise never grow in the padding loop below and the
        # call would hang forever.
        if not isinstance(audio, str):
            self._require_samples(audio)

        fs = SAMPLING_RATE
        if isinstance(audio, str):
            audio, _ = librosa.load(audio, sr=fs)
        elif sampling_rate != fs:
            # resample audio
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=fs)
        # Also covers a file that decodes to zero samples.
        self._require_samples(audio)

        actual_audio_len = len(audio)
        len_samples = int(INPUT_LENGTH * fs)
        # Short clips are repeated (doubled) until one full window fits.
        while len(audio) < len_samples:
            audio = np.append(audio, audio)

        num_hops = int(np.floor(len(audio) / fs) - INPUT_LENGTH) + 1
        hop_len_samples = fs  # windows advance by one second

        raw_scores = {"sig": [], "bak": [], "ovr": []}
        fit_scores = {"sig": [], "bak": [], "ovr": []}

        for idx in range(num_hops):
            start = int(idx * hop_len_samples)
            stop = int((idx + INPUT_LENGTH) * hop_len_samples)
            audio_seg = audio[start:stop]
            if len(audio_seg) < len_samples:
                # Incomplete trailing window: skip it.
                continue

            input_features = np.array(audio_seg).astype("float32")[np.newaxis, :]
            oi = {"input_1": input_features}
            mos_sig_raw, mos_bak_raw, mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0]
            mos_sig, mos_bak, mos_ovr = self.get_polyfit_val(
                mos_sig_raw, mos_bak_raw, mos_ovr_raw, is_personalized_MOS
            )

            raw_scores["sig"].append(mos_sig_raw)
            raw_scores["bak"].append(mos_bak_raw)
            raw_scores["ovr"].append(mos_ovr_raw)
            fit_scores["sig"].append(mos_sig)
            fit_scores["bak"].append(mos_bak)
            fit_scores["ovr"].append(mos_ovr)

        clip_dict = {
            "filename": "audio_clip",
            "len_in_sec": actual_audio_len / fs,
            "sr": fs,
            "num_hops": num_hops,
            "OVRL_raw": np.mean(raw_scores["ovr"]),
            "SIG_raw": np.mean(raw_scores["sig"]),
            "BAK_raw": np.mean(raw_scores["bak"]),
            "OVRL": np.mean(fit_scores["ovr"]),
            "SIG": np.mean(fit_scores["sig"]),
            "BAK": np.mean(fit_scores["bak"]),
        }
        return clip_dict