# NOTE(review): the three lines below are Hugging Face upload-page metadata that
# was pasted into the file verbatim; commented out so the module is valid Python.
# Hecheng0625's picture
# Upload 409 files
# c968fc3 verified
# Source: https://github.com/microsoft/DNS-Challenge/tree/master/DNSMOS
#
# Copyright (c) 2022 Microsoft
#
# This code is licensed under the Creative Commons Attribution 4.0 International (CC BY 4.0) license.
# The full license text is available at the root of the source repository.
#
# Note: This code has been modified to fit the context of this repository.
# This code is included in an MIT-licensed repository.
# The repository's MIT license does not apply to this code.
import os
import librosa
import numpy as np
import onnxruntime as ort
import pandas as pd
import tqdm
import warnings
# Suppress runtime warnings (e.g. librosa/numpy deprecation and empty-slice
# warnings) so batch evaluation output stays readable.
warnings.filterwarnings("ignore")
SAMPLING_RATE = 16000  # sample rate (Hz) the DNSMOS ONNX model expects
INPUT_LENGTH = 9.01  # length (seconds) of each scored audio window
class ComputeScore:
"""
ComputeScore class for evaluating DNSMOS.
"""
def __init__(self, primary_model_path, device="cpu") -> None:
"""
Initialize the ComputeScore object.
Args:
primary_model_path (str): Path to the primary model.
device (str): Device to run the models on ('cpu' or 'cuda').
Returns:
None
Raises:
RuntimeError: If the device is not supported.
"""
if device == "cuda":
self.onnx_sess = ort.InferenceSession(
primary_model_path, providers=["CUDAExecutionProvider"]
)
print("Using CUDA:", self.onnx_sess.get_providers())
else:
self.onnx_sess = ort.InferenceSession(primary_model_path)
def audio_melspec(
self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True
):
"""
Compute the mel spectrogram of an audio signal.
Args:
audio (np.ndarray): Input audio signal.
n_mels (int): Number of mel bands.
frame_size (int): Size of the FFT window.
hop_length (int): Number of samples between successive frames.
sr (int): Sampling rate.
to_db (bool): Whether to convert the power spectrogram to decibel units.
Returns:
np.ndarray: Mel spectrogram.
"""
mel_spec = librosa.feature.melspectrogram(
y=audio, sr=sr, n_fft=frame_size + 1, hop_length=hop_length, n_mels=n_mels
)
if to_db:
mel_spec = (librosa.power_to_db(mel_spec, ref=np.max) + 40) / 40
return mel_spec.T
def get_polyfit_val(self, sig, bak, ovr, is_personalized_MOS):
"""
Apply polynomial fitting to MOS scores.
Args:
sig (float): Signal MOS score.
bak (float): Background MOS score.
ovr (float): Overall MOS score.
is_personalized_MOS (bool): Flag for personalized MOS.
Returns:
tuple: Tuple containing the adjusted signal, background, and overall MOS scores.
"""
if is_personalized_MOS:
p_ovr = np.poly1d([-0.00533021, 0.005101, 1.18058466, -0.11236046])
p_sig = np.poly1d([-0.01019296, 0.02751166, 1.19576786, -0.24348726])
p_bak = np.poly1d([-0.04976499, 0.44276479, -0.1644611, 0.96883132])
else:
p_ovr = np.poly1d([-0.06766283, 1.11546468, 0.04602535])
p_sig = np.poly1d([-0.08397278, 1.22083953, 0.0052439])
p_bak = np.poly1d([-0.13166888, 1.60915514, -0.39604546])
sig_poly = p_sig(sig)
bak_poly = p_bak(bak)
ovr_poly = p_ovr(ovr)
return sig_poly, bak_poly, ovr_poly
def __call__(self, audio, sampling_rate, is_personalized_MOS):
"""
Compute DNSMOS scores for an audio signal.
Args:
audio (np.ndarray or str): Input audio signal or path to audio file.
sampling_rate (int): Sampling rate of the input audio.
is_personalized_MOS (bool): Flag for personalized MOS.
Returns:
dict: Dictionary containing MOS scores.
Raises:
ValueError: If the input audio is not valid.
"""
fs = SAMPLING_RATE
if isinstance(audio, str):
audio, _ = librosa.load(audio, sr=fs)
elif sampling_rate != fs:
# resample audio
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=fs)
actual_audio_len = len(audio)
len_samples = int(INPUT_LENGTH * fs)
while len(audio) < len_samples:
audio = np.append(audio, audio)
num_hops = int(np.floor(len(audio) / fs) - INPUT_LENGTH) + 1
hop_len_samples = fs
predicted_mos_sig_seg_raw = []
predicted_mos_bak_seg_raw = []
predicted_mos_ovr_seg_raw = []
predicted_mos_sig_seg = []
predicted_mos_bak_seg = []
predicted_mos_ovr_seg = []
for idx in range(num_hops):
audio_seg = audio[
int(idx * hop_len_samples) : int((idx + INPUT_LENGTH) * hop_len_samples)
]
if len(audio_seg) < len_samples:
continue
input_features = np.array(audio_seg).astype("float32")[np.newaxis, :]
oi = {"input_1": input_features}
mos_sig_raw, mos_bak_raw, mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0]
mos_sig, mos_bak, mos_ovr = self.get_polyfit_val(
mos_sig_raw, mos_bak_raw, mos_ovr_raw, is_personalized_MOS
)
predicted_mos_sig_seg_raw.append(mos_sig_raw)
predicted_mos_bak_seg_raw.append(mos_bak_raw)
predicted_mos_ovr_seg_raw.append(mos_ovr_raw)
predicted_mos_sig_seg.append(mos_sig)
predicted_mos_bak_seg.append(mos_bak)
predicted_mos_ovr_seg.append(mos_ovr)
clip_dict = {
"filename": "audio_clip",
"len_in_sec": actual_audio_len / fs,
"sr": fs,
"num_hops": num_hops,
"OVRL_raw": np.mean(predicted_mos_ovr_seg_raw),
"SIG_raw": np.mean(predicted_mos_sig_seg_raw),
"BAK_raw": np.mean(predicted_mos_bak_seg_raw),
"OVRL": np.mean(predicted_mos_ovr_seg),
"SIG": np.mean(predicted_mos_sig_seg),
"BAK": np.mean(predicted_mos_bak_seg),
}
return clip_dict