# Scrape residue from the HuggingFace file viewer, commented out to keep the file valid Python:
# reach-vb's picture
# reach-vb HF staff
# Stereo demo update (#60)
# 5325fcc
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import csv
import json
import logging
from pathlib import Path
import tempfile
import typing as tp
import subprocess
import shutil
import torch
import torchaudio
logger = logging.getLogger(__name__)
class ViSQOL:
    """ViSQOL wrapper to run ViSQOL from Python using a pre-installed binary.

    To learn more about ViSQOL and how to build ViSQOL binary using bazel, please refer to the
    instructions available in the open source repository: https://github.com/google/visqol

    ViSQOL is capable of running in two modes:

    Audio Mode:
        When running in audio mode, input signals must have a 48kHz sample rate. Input should be resampled to 48kHz.
        Input signals can be multi-channel, but they will be down-mixed to mono for performing the comparison.
        Audio mode uses support vector regression, with the maximum range at ~4.75.

    Speech Mode:
        When running in speech mode, ViSQOL uses a wideband model. It therefore expects input sample rates of 16kHz.
        Input should be resampled to 16kHz.
        As part of the speech mode processing, a root mean square implementation for voice activity detection
        is performed on the reference signal to determine what parts of the signal have voice activity and
        should therefore be included in the comparison. The signal is normalized before performing the voice
        activity detection.
        Input signals can be multi-channel, but they will be down-mixed to mono for performing the comparison.
        Speech mode is scaled to have a maximum MOS of 5.0 to match previous version behavior.

    For more details, check the guidelines: https://github.com/google/visqol#general-guidelines-for-input

    Args:
        bin (str or Path): Path to the ViSQOL install (directory containing ``bazel-bin/visqol``
            and the ``model/`` folder).
        mode (str): ViSQOL computation mode, expecting "audio" or "speech".
        model (str): Name of the model to use for similarity to quality model.
        debug (bool): Whether to also get debug metrics from ViSQOL or not.
    """
    # Each mode has a mandated input sample rate (see ViSQOL guidelines above).
    SAMPLE_RATES_MODES = {"audio": 48_000, "speech": 16_000}
    ALLOWED_SAMPLE_RATES = frozenset(SAMPLE_RATES_MODES.values())

    def __init__(self, bin: tp.Union[Path, str], mode: str = "audio",
                 model: str = "libsvm_nu_svr_model.txt", debug: bool = False):
        assert bin is not None and Path(bin).exists(), f"Could not find ViSQOL binary in specified path: {bin}"
        self.visqol_bin = str(bin)
        self.visqol_mode = mode
        self.target_sr = self._get_target_sr(self.visqol_mode)
        self.model = model
        self.debug = debug
        assert Path(self.visqol_model).exists(), \
            f"Could not find the specified model in ViSQOL install: {self.visqol_model}"

    def _get_target_sr(self, mode: str) -> int:
        """Return the mandatory input sample rate for the given ViSQOL mode.

        Raises:
            ValueError: If ``mode`` is not one of the supported modes.
        """
        if mode not in ViSQOL.SAMPLE_RATES_MODES:
            raise ValueError(
                f"Unsupported mode! Allowed are: {', '.join(ViSQOL.SAMPLE_RATES_MODES.keys())}"
            )
        return ViSQOL.SAMPLE_RATES_MODES[mode]

    def _prepare_files(
        self, ref_sig: torch.Tensor, deg_sig: torch.Tensor, sr: int, target_sr: int, pad_with_silence: bool = False
    ):
        """Dump the (resampled, optionally padded) batch to wav files and build the input csv.

        Args:
            ref_sig (torch.Tensor): Reference signals as [B, C, T].
            deg_sig (torch.Tensor): Degraded signals as [B, C, T].
            sr (int): Sample rate of the provided signals.
            target_sr (int): Sample rate expected by the selected ViSQOL mode.
            pad_with_silence (bool): Whether to pad each signal with 0.5s of silence on both sides.

        Returns:
            tuple: (tmp_dir, input_csv_path, results_csv_path, debug_json_path); the three paths
            are None when preparation failed (the caller is responsible for flushing tmp_dir).
        """
        assert target_sr in ViSQOL.ALLOWED_SAMPLE_RATES
        assert len(ref_sig) == len(deg_sig), (
            "Expects same number of ref and degraded inputs"
            f" but ref len {len(ref_sig)} != deg len {len(deg_sig)}"
        )
        # Resample if needed and/or pad with silence as recommended in the ViSQOL guidelines.
        # Fix: padding used to be applied only when resampling happened, so requesting
        # pad_with_silence with an input already at target_sr silently did nothing.
        if sr != target_sr or pad_with_silence:
            transform = torchaudio.transforms.Resample(sr, target_sr) if sr != target_sr else None
            pad = int(0.5 * target_sr)  # 0.5 seconds of silence on each side
            rs_ref = []
            rs_deg = []
            for i in range(len(ref_sig)):
                rs_ref_i = transform(ref_sig[i]) if transform is not None else ref_sig[i]
                rs_deg_i = transform(deg_sig[i]) if transform is not None else deg_sig[i]
                if pad_with_silence:
                    rs_ref_i = torch.nn.functional.pad(rs_ref_i, (pad, pad), mode='constant', value=0)
                    rs_deg_i = torch.nn.functional.pad(rs_deg_i, (pad, pad), mode='constant', value=0)
                rs_ref.append(rs_ref_i)
                rs_deg.append(rs_deg_i)
            ref_sig = torch.stack(rs_ref)
            deg_sig = torch.stack(rs_deg)
        # save audio chunks to tmp dir and create csv
        tmp_dir = Path(tempfile.mkdtemp())
        try:
            tmp_input_csv_path = tmp_dir / "input.csv"
            tmp_results_csv_path = tmp_dir / "results.csv"
            tmp_debug_json_path = tmp_dir / "debug.json"
            with open(tmp_input_csv_path, "w") as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(["reference", "degraded"])
                for i in range(len(ref_sig)):
                    tmp_ref_filename = tmp_dir / f"ref_{i}.wav"
                    tmp_deg_filename = tmp_dir / f"deg_{i}.wav"
                    # Clamp slightly inside [-1, 1] to avoid clipping artifacts in 16-bit PCM.
                    torchaudio.save(
                        tmp_ref_filename,
                        torch.clamp(ref_sig[i], min=-0.99, max=0.99),
                        sample_rate=target_sr,
                        bits_per_sample=16,
                        encoding="PCM_S"
                    )
                    torchaudio.save(
                        tmp_deg_filename,
                        torch.clamp(deg_sig[i], min=-0.99, max=0.99),
                        sample_rate=target_sr,
                        bits_per_sample=16,
                        encoding="PCM_S"
                    )
                    csv_writer.writerow([str(tmp_ref_filename), str(tmp_deg_filename)])
            return tmp_dir, tmp_input_csv_path, tmp_results_csv_path, tmp_debug_json_path
        except Exception as e:
            logger.error("Exception occurred when preparing files for ViSQOL: %s", e)
            return tmp_dir, None, None, None

    def _flush_files(self, tmp_dir: tp.Union[Path, str]):
        """Remove the temporary directory (wav files and csvs) used for a ViSQOL run."""
        shutil.rmtree(str(tmp_dir))

    def _collect_moslqo_score(self, results_csv_path: tp.Union[Path, str]) -> float:
        """Average the per-pair moslqo scores from the ViSQOL results csv (0.0 if empty)."""
        with open(results_csv_path, "r") as csv_file:
            reader = csv.DictReader(csv_file)
            moslqo_scores = [float(row["moslqo"]) for row in reader]
        if moslqo_scores:
            return sum(moslqo_scores) / len(moslqo_scores)
        return 0.0

    def _collect_debug_data(self, debug_json_path: tp.Union[Path, str]) -> dict:
        """Load the debug data emitted by ViSQOL's --output_debug json file."""
        with open(debug_json_path, "r") as f:
            data = json.load(f)
        return data

    @property
    def visqol_model(self):
        """Full path to the similarity-to-quality model inside the ViSQOL install."""
        return f'{self.visqol_bin}/model/{self.model}'

    def _run_visqol(
        self,
        input_csv_path: tp.Union[Path, str],
        results_csv_path: tp.Union[Path, str],
        debug_csv_path: tp.Optional[tp.Union[Path, str]],
    ):
        """Invoke the ViSQOL binary over the batch described by the input csv.

        Raises:
            RuntimeError: If the binary exits with a non-zero return code.
        """
        input_csv_path = str(input_csv_path)
        results_csv_path = str(results_csv_path)
        cmd = [
            f'{self.visqol_bin}/bazel-bin/visqol',
            '--batch_input_csv', f'{input_csv_path}',
            '--results_csv', f'{results_csv_path}'
        ]
        # Fix: `str(debug_csv_path)` used to turn None into the string "None", so
        # `--output_debug None` was always passed even with debug disabled.
        if debug_csv_path is not None:
            cmd += ['--output_debug', str(debug_csv_path)]
        if self.visqol_mode == "speech":
            cmd += ['--use_speech_mode']
        cmd += ['--similarity_to_quality_model', f'{self.visqol_model}']
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode:
            logger.error("Error with visqol: \n %s \n %s", result.stdout.decode(), result.stderr.decode())
            raise RuntimeError("Error while executing visqol")

    def __call__(
        self,
        ref_sig: torch.Tensor,
        deg_sig: torch.Tensor,
        sr: int,
        pad_with_silence: bool = False,
    ):
        """Calculate the ViSQOL metric for a pair of audio signals at a given sample rate.

        Args:
            ref_sig (torch.Tensor): Reference signals as [B, C, T].
            deg_sig (torch.Tensor): Degraded signals as [B, C, T].
            sr (int): Sample rate of the two audio signals.
            pad_with_silence (bool): Whether to pad the file with silences as recommended
                in visqol guidelines (see: https://github.com/google/visqol#general-guidelines-for-input).

        Returns:
            float: The ViSQOL score or mean score for the batch.

        Raises:
            RuntimeError: If file preparation or the ViSQOL binary fails.
        """
        logger.debug(f"Calculating visqol with mode={self.visqol_mode} on {len(ref_sig)} samples")
        tmp_dir, input_csv, results_csv, debug_json = self._prepare_files(
            ref_sig, deg_sig, sr, self.target_sr, pad_with_silence
        )
        try:
            if input_csv and results_csv:
                self._run_visqol(
                    input_csv,
                    results_csv,
                    debug_json if self.debug else None,
                )
                mosqol = self._collect_moslqo_score(results_csv)
                return mosqol
            else:
                raise RuntimeError("Something unexpected happened when running VISQOL!")
        except Exception as e:
            logger.error("Exception occurred when running ViSQOL: %s", e)
            # Fix: re-raise instead of silently returning None — the contract promises a float.
            raise
        finally:
            # Always clean up the temporary wav/csv files, even on failure.
            self._flush_files(tmp_dir)