# Scrape residue from the HuggingFace file viewer, commented out to keep the file valid Python:
# reach-vb's picture
# reach-vb HF staff
# Stereo demo update (#60)
# 5325fcc
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import csv
import json
import logging
from pathlib import Path
import tempfile
import typing as tp
import subprocess
import shutil
import torch
import torchaudio
logger = logging.getLogger(__name__)
class ViSQOL:
    """ViSQOL wrapper to run ViSQOL from Python using a pre-installed binary.

    To learn more about ViSQOL and how to build ViSQOL binary using bazel, please refer to the
    instructions available in the open source repository: https://github.com/google/visqol

    ViSQOL is capable of running in two modes:

    Audio Mode:
        When running in audio mode, input signals must have a 48kHz sample rate. Input should be resampled to 48kHz.
        Input signals can be multi-channel, but they will be down-mixed to mono for performing the comparison.
        Audio mode uses support vector regression, with the maximum range at ~4.75.

    Speech Mode:
        When running in speech mode, ViSQOL uses a wideband model. It therefore expects input sample rates of 16kHz.
        Input should be resampled to 16kHz.
        As part of the speech mode processing, a root mean square implementation for voice activity detection
        is performed on the reference signal to determine what parts of the signal have voice activity and
        should therefore be included in the comparison. The signal is normalized before performing the voice
        activity detection.
        Input signals can be multi-channel, but they will be down-mixed to mono for performing the comparison.
        Speech mode is scaled to have a maximum MOS of 5.0 to match previous version behavior.

    For more details, check the guidelines: https://github.com/google/visqol#general-guidelines-for-input

    Args:
        bin (str or Path): Path to the ViSQOL install (directory containing ``bazel-bin/visqol``
            and the ``model/`` folder).
        mode (str): ViSQOL computation mode, expecting "audio" or "speech".
        model (str): Name of the model to use for similarity to quality model.
        debug (bool): Whether to also get debug metrics from ViSQOL or not.
    """
    # Each mode has a mandated input sample rate (see ViSQOL guidelines above).
    SAMPLE_RATES_MODES = {"audio": 48_000, "speech": 16_000}
    ALLOWED_SAMPLE_RATES = frozenset(SAMPLE_RATES_MODES.values())

    def __init__(self, bin: tp.Union[Path, str], mode: str = "audio",
                 model: str = "libsvm_nu_svr_model.txt", debug: bool = False):
        assert bin is not None and Path(bin).exists(), f"Could not find ViSQOL binary in specified path: {bin}"
        self.visqol_bin = str(bin)
        self.visqol_mode = mode
        self.target_sr = self._get_target_sr(self.visqol_mode)
        self.model = model
        self.debug = debug
        assert Path(self.visqol_model).exists(), \
            f"Could not find the specified model in ViSQOL install: {self.visqol_model}"

    def _get_target_sr(self, mode: str) -> int:
        """Return the mandatory input sample rate for the given ViSQOL mode.

        Raises:
            ValueError: If ``mode`` is not one of the supported modes.
        """
        if mode not in ViSQOL.SAMPLE_RATES_MODES:
            raise ValueError(
                f"Unsupported mode! Allowed are: {', '.join(ViSQOL.SAMPLE_RATES_MODES.keys())}"
            )
        return ViSQOL.SAMPLE_RATES_MODES[mode]

    def _prepare_files(
        self, ref_sig: torch.Tensor, deg_sig: torch.Tensor, sr: int, target_sr: int, pad_with_silence: bool = False
    ):
        """Dump the (resampled, optionally padded) batch to wav files and build the input csv.

        Args:
            ref_sig (torch.Tensor): Reference signals as [B, C, T].
            deg_sig (torch.Tensor): Degraded signals as [B, C, T].
            sr (int): Sample rate of the provided signals.
            target_sr (int): Sample rate expected by the selected ViSQOL mode.
            pad_with_silence (bool): Whether to pad each signal with 0.5s of silence on both sides.

        Returns:
            tuple: (tmp_dir, input_csv_path, results_csv_path, debug_json_path); the three paths
            are None when preparation failed (the caller is responsible for flushing tmp_dir).
        """
        assert target_sr in ViSQOL.ALLOWED_SAMPLE_RATES
        assert len(ref_sig) == len(deg_sig), (
            "Expects same number of ref and degraded inputs"
            f" but ref len {len(ref_sig)} != deg len {len(deg_sig)}"
        )
        # Resample if needed and/or pad with silence as recommended in the ViSQOL guidelines.
        # Fix: padding used to be applied only when resampling happened, so requesting
        # pad_with_silence with an input already at target_sr silently did nothing.
        if sr != target_sr or pad_with_silence:
            transform = torchaudio.transforms.Resample(sr, target_sr) if sr != target_sr else None
            pad = int(0.5 * target_sr)  # 0.5 seconds of silence on each side
            rs_ref = []
            rs_deg = []
            for i in range(len(ref_sig)):
                rs_ref_i = transform(ref_sig[i]) if transform is not None else ref_sig[i]
                rs_deg_i = transform(deg_sig[i]) if transform is not None else deg_sig[i]
                if pad_with_silence:
                    rs_ref_i = torch.nn.functional.pad(rs_ref_i, (pad, pad), mode='constant', value=0)
                    rs_deg_i = torch.nn.functional.pad(rs_deg_i, (pad, pad), mode='constant', value=0)
                rs_ref.append(rs_ref_i)
                rs_deg.append(rs_deg_i)
            ref_sig = torch.stack(rs_ref)
            deg_sig = torch.stack(rs_deg)
        # save audio chunks to tmp dir and create csv
        tmp_dir = Path(tempfile.mkdtemp())
        try:
            tmp_input_csv_path = tmp_dir / "input.csv"
            tmp_results_csv_path = tmp_dir / "results.csv"
            tmp_debug_json_path = tmp_dir / "debug.json"
            with open(tmp_input_csv_path, "w") as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(["reference", "degraded"])
                for i in range(len(ref_sig)):
                    tmp_ref_filename = tmp_dir / f"ref_{i}.wav"
                    tmp_deg_filename = tmp_dir / f"deg_{i}.wav"
                    # Clamp slightly inside [-1, 1] to avoid clipping artifacts in 16-bit PCM.
                    torchaudio.save(
                        tmp_ref_filename,
                        torch.clamp(ref_sig[i], min=-0.99, max=0.99),
                        sample_rate=target_sr,
                        bits_per_sample=16,
                        encoding="PCM_S"
                    )
                    torchaudio.save(
                        tmp_deg_filename,
                        torch.clamp(deg_sig[i], min=-0.99, max=0.99),
                        sample_rate=target_sr,
                        bits_per_sample=16,
                        encoding="PCM_S"
                    )
                    csv_writer.writerow([str(tmp_ref_filename), str(tmp_deg_filename)])
            return tmp_dir, tmp_input_csv_path, tmp_results_csv_path, tmp_debug_json_path
        except Exception as e:
            logger.error("Exception occurred when preparing files for ViSQOL: %s", e)
            return tmp_dir, None, None, None

    def _flush_files(self, tmp_dir: tp.Union[Path, str]):
        """Remove the temporary directory (wav files and csvs) used for a ViSQOL run."""
        shutil.rmtree(str(tmp_dir))

    def _collect_moslqo_score(self, results_csv_path: tp.Union[Path, str]) -> float:
        """Average the per-pair moslqo scores from the ViSQOL results csv (0.0 if empty)."""
        with open(results_csv_path, "r") as csv_file:
            reader = csv.DictReader(csv_file)
            moslqo_scores = [float(row["moslqo"]) for row in reader]
        if moslqo_scores:
            return sum(moslqo_scores) / len(moslqo_scores)
        return 0.0

    def _collect_debug_data(self, debug_json_path: tp.Union[Path, str]) -> dict:
        """Load the debug data emitted by ViSQOL's --output_debug json file."""
        with open(debug_json_path, "r") as f:
            data = json.load(f)
        return data

    @property
    def visqol_model(self):
        """Full path to the similarity-to-quality model inside the ViSQOL install."""
        return f'{self.visqol_bin}/model/{self.model}'

    def _run_visqol(
        self,
        input_csv_path: tp.Union[Path, str],
        results_csv_path: tp.Union[Path, str],
        debug_csv_path: tp.Optional[tp.Union[Path, str]],
    ):
        """Invoke the ViSQOL binary over the batch described by the input csv.

        Raises:
            RuntimeError: If the binary exits with a non-zero return code.
        """
        input_csv_path = str(input_csv_path)
        results_csv_path = str(results_csv_path)
        cmd = [
            f'{self.visqol_bin}/bazel-bin/visqol',
            '--batch_input_csv', f'{input_csv_path}',
            '--results_csv', f'{results_csv_path}'
        ]
        # Fix: `str(debug_csv_path)` used to turn None into the string "None", so
        # `--output_debug None` was always passed even with debug disabled.
        if debug_csv_path is not None:
            cmd += ['--output_debug', str(debug_csv_path)]
        if self.visqol_mode == "speech":
            cmd += ['--use_speech_mode']
        cmd += ['--similarity_to_quality_model', f'{self.visqol_model}']
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode:
            logger.error("Error with visqol: \n %s \n %s", result.stdout.decode(), result.stderr.decode())
            raise RuntimeError("Error while executing visqol")

    def __call__(
        self,
        ref_sig: torch.Tensor,
        deg_sig: torch.Tensor,
        sr: int,
        pad_with_silence: bool = False,
    ):
        """Calculate the ViSQOL metric for a pair of audio signals at a given sample rate.

        Args:
            ref_sig (torch.Tensor): Reference signals as [B, C, T].
            deg_sig (torch.Tensor): Degraded signals as [B, C, T].
            sr (int): Sample rate of the two audio signals.
            pad_with_silence (bool): Whether to pad the file with silences as recommended
                in visqol guidelines (see: https://github.com/google/visqol#general-guidelines-for-input).

        Returns:
            float: The ViSQOL score or mean score for the batch.

        Raises:
            RuntimeError: If file preparation or the ViSQOL binary fails.
        """
        logger.debug(f"Calculating visqol with mode={self.visqol_mode} on {len(ref_sig)} samples")
        tmp_dir, input_csv, results_csv, debug_json = self._prepare_files(
            ref_sig, deg_sig, sr, self.target_sr, pad_with_silence
        )
        try:
            if input_csv and results_csv:
                self._run_visqol(
                    input_csv,
                    results_csv,
                    debug_json if self.debug else None,
                )
                mosqol = self._collect_moslqo_score(results_csv)
                return mosqol
            else:
                raise RuntimeError("Something unexpected happened when running VISQOL!")
        except Exception as e:
            logger.error("Exception occurred when running ViSQOL: %s", e)
            # Fix: re-raise instead of silently returning None — the contract promises a float.
            raise
        finally:
            # Always clean up the temporary wav/csv files, even on failure.
            self._flush_files(tmp_dir)