# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import csv
import json
import logging
from pathlib import Path
import tempfile
import typing as tp
import subprocess
import shutil

import torch
import torchaudio


logger = logging.getLogger(__name__)


class ViSQOL:
    """ViSQOL wrapper to run ViSQOL from Python using a pre-installed binary.

    To learn more about ViSQOL and how to build ViSQOL binary using bazel, please refer to the
    instructions available in the open source repository: https://github.com/google/visqol

    ViSQOL is capable of running in two modes:

    Audio Mode:
        When running in audio mode, input signals must have a 48kHz sample rate. Input should be resampled to 48kHz.
        Input signals can be multi-channel, but they will be down-mixed to mono for performing the comparison.
        Audio mode uses support vector regression, with the maximum range at ~4.75.

    Speech Mode:
        When running in speech mode, ViSQOL uses a wideband model. It therefore expects input sample rates of 16kHz.
            Input should be resampled to 16kHz.
        As part of the speech mode processing, a root mean square implementation for voice activity detection
            is performed on the reference signal to determine what parts of the signal have voice activity and
            should therefore be included in the comparison. The signal is normalized before performing the voice
            activity detection.
        Input signals can be multi-channel, but they will be down-mixed to mono for performing the comparison.
        Speech mode is scaled to have a maximum MOS of 5.0 to match previous version behavior.

    For more details, check the guidelines: https://github.com/google/visqol#general-guidelines-for-input

    Args:
        bin (Path or str): Path to the ViSQOL install root. The executable is looked up at
            `<bin>/bazel-bin/visqol` and the models under `<bin>/model/`.
        mode (str): ViSQOL computation mode, expecting "audio" or "speech".
        model (str): Name of the model to use for similarity to quality model.
        debug (bool): Whether to also get debug metrics from ViSQOL or not.
    """
    SAMPLE_RATES_MODES = {"audio": 48_000, "speech": 16_000}
    ALLOWED_SAMPLE_RATES = frozenset(SAMPLE_RATES_MODES.values())

    def __init__(self, bin: tp.Union[Path, str], mode: str = "audio",
                 model: str = "libsvm_nu_svr_model.txt", debug: bool = False):
        # NOTE: `bin` shadows the builtin, but it is part of the public signature
        # and is kept so keyword callers keep working.
        assert bin is not None and Path(bin).exists(), f"Could not find ViSQOL binary in specified path: {bin}"
        self.visqol_bin = str(bin)
        self.visqol_mode = mode
        # Raises ValueError for an unsupported mode.
        self.target_sr = self._get_target_sr(self.visqol_mode)
        self.model = model
        self.debug = debug
        assert Path(self.visqol_model).exists(), \
            f"Could not find the specified model in ViSQOL install: {self.visqol_model}"

    def _get_target_sr(self, mode: str) -> int:
        """Return the target sampling rate for the given ViSQOL mode.

        Raises:
            ValueError: If `mode` is not a key of `SAMPLE_RATES_MODES`.
        """
        if mode not in ViSQOL.SAMPLE_RATES_MODES:
            raise ValueError(
                f"Unsupported mode! Allowed are: {', '.join(ViSQOL.SAMPLE_RATES_MODES)}"
            )
        return ViSQOL.SAMPLE_RATES_MODES[mode]

    def _prepare_files(
        self, ref_sig: torch.Tensor, deg_sig: torch.Tensor, sr: int, target_sr: int, pad_with_silence: bool = False
    ) -> tp.Tuple[Path, tp.Optional[Path], tp.Optional[Path], tp.Optional[Path]]:
        """Write the audio pairs and the batch input CSV to a fresh temporary directory.

        Each signal is resampled to `target_sr` when `sr` differs, optionally padded with
        0.5 seconds of silence on both sides, clamped to [-0.99, 0.99] and saved as 16-bit PCM.

        Returns:
            tuple: `(tmp_dir, input_csv_path, results_csv_path, debug_json_path)`. If writing the
                files fails, the error is logged and the three paths are `None`; `tmp_dir` is
                always returned so the caller can remove it.
        """
        assert target_sr in ViSQOL.ALLOWED_SAMPLE_RATES
        # Adjacent literals are concatenated into a single message string.
        assert len(ref_sig) == len(deg_sig), (
            "Expects same number of ref and degraded inputs"
            f" but ref len {len(ref_sig)} != deg len {len(deg_sig)}"
        )

        # Resample and/or pad only when needed; padding no longer depends on resampling.
        resample = torchaudio.transforms.Resample(sr, target_sr) if sr != target_sr else None
        if resample is not None or pad_with_silence:
            pad = int(0.5 * target_sr)

            def _process(sig: torch.Tensor) -> torch.Tensor:
                if resample is not None:
                    sig = resample(sig)
                if pad_with_silence:
                    sig = torch.nn.functional.pad(sig, (pad, pad), mode='constant', value=0)
                return sig

            ref_sig = torch.stack([_process(sig) for sig in ref_sig])
            deg_sig = torch.stack([_process(sig) for sig in deg_sig])

        def _save_wav(path: Path, sig: torch.Tensor) -> None:
            torchaudio.save(
                path,
                torch.clamp(sig, min=-0.99, max=0.99),
                sample_rate=target_sr, bits_per_sample=16, encoding="PCM_S"
            )

        # save audio chunks to tmp dir and create csv
        tmp_dir = Path(tempfile.mkdtemp())
        try:
            tmp_input_csv_path = tmp_dir / "input.csv"
            tmp_results_csv_path = tmp_dir / "results.csv"
            tmp_debug_json_path = tmp_dir / "debug.json"
            # newline="" lets the csv module control line endings itself.
            with open(tmp_input_csv_path, "w", newline="") as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(["reference", "degraded"])
                for i, (ref_i, deg_i) in enumerate(zip(ref_sig, deg_sig)):
                    tmp_ref_filename = tmp_dir / f"ref_{i}.wav"
                    tmp_deg_filename = tmp_dir / f"deg_{i}.wav"
                    _save_wav(tmp_ref_filename, ref_i)
                    _save_wav(tmp_deg_filename, deg_i)
                    csv_writer.writerow([str(tmp_ref_filename), str(tmp_deg_filename)])
            return tmp_dir, tmp_input_csv_path, tmp_results_csv_path, tmp_debug_json_path
        except Exception as e:
            logger.error("Exception occurred when preparing files for ViSQOL: %s", e)
            return tmp_dir, None, None, None

    def _flush_files(self, tmp_dir: tp.Union[Path, str]) -> None:
        """Remove the temporary directory used to compute ViSQOL, with everything in it."""
        shutil.rmtree(str(tmp_dir))

    def _collect_moslqo_score(self, results_csv_path: tp.Union[Path, str]) -> float:
        """Return the mean of the `moslqo` column of the results CSV, or 0.0 if it has no rows."""
        with open(results_csv_path, "r", newline="") as csv_file:
            reader = csv.DictReader(csv_file)
            moslqo_scores = [float(row["moslqo"]) for row in reader]
        if not moslqo_scores:
            return 0.0
        return sum(moslqo_scores) / len(moslqo_scores)

    def _collect_debug_data(self, debug_json_path: tp.Union[Path, str]) -> dict:
        """Load and return the JSON debug data written by the ViSQOL run."""
        with open(debug_json_path, "r") as f:
            return json.load(f)

    @property
    def visqol_model(self) -> str:
        """Path of the similarity-to-quality model inside the ViSQOL install."""
        return f'{self.visqol_bin}/model/{self.model}'

    def _run_visqol(
        self,
        input_csv_path: tp.Union[Path, str],
        results_csv_path: tp.Union[Path, str],
        debug_csv_path: tp.Optional[tp.Union[Path, str]],
    ) -> None:
        """Run the ViSQOL binary in batch mode.

        Args:
            input_csv_path (Path or str): Batch input CSV listing reference/degraded pairs.
            results_csv_path (Path or str): Where ViSQOL should write its results CSV.
            debug_csv_path (Path or str, optional): Where ViSQOL should write debug output.
                When `None`, the `--output_debug` flag is not passed.

        Raises:
            RuntimeError: If the process exits with a non-zero return code.
        """
        cmd = [
            f'{self.visqol_bin}/bazel-bin/visqol',
            '--batch_input_csv', str(input_csv_path),
            '--results_csv', str(results_csv_path),
        ]
        # Test for None before stringifying: str(None) would be the truthy "None".
        if debug_csv_path is not None:
            cmd += ['--output_debug', str(debug_csv_path)]
        if self.visqol_mode == "speech":
            cmd += ['--use_speech_mode']
        cmd += ['--similarity_to_quality_model', self.visqol_model]
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode:
            # errors="replace" keeps undecodable output from masking the real failure.
            logger.error(
                "Error with visqol: \n %s \n %s",
                result.stdout.decode(errors="replace"),
                result.stderr.decode(errors="replace"),
            )
            raise RuntimeError("Error while executing visqol")

    def __call__(
        self,
        ref_sig: torch.Tensor,
        deg_sig: torch.Tensor,
        sr: int,
        pad_with_silence: bool = False,
    ) -> tp.Optional[float]:
        """Calculate the ViSQOL metric for a pair of audio signals at a given sample rate.

        Args:
            ref_sig (torch.Tensor): Reference signals as [B, C, T].
            deg_sig (torch.Tensor): Degraded signals as [B, C, T].
            sr (int): Sample rate of the two audio signals.
            pad_with_silence (bool): Whether to pad the file with silences as recommended
                in visqol guidelines (see: https://github.com/google/visqol#general-guidelines-for-input).
        Returns:
            float: The ViSQOL score or mean score for the batch. If preparing the files or running
                ViSQOL fails, the error is logged and `None` is returned instead.
        """
        logger.debug("Calculating visqol with mode=%s on %d samples", self.visqol_mode, len(ref_sig))
        tmp_dir, input_csv, results_csv, debug_json = self._prepare_files(
            ref_sig, deg_sig, sr, self.target_sr, pad_with_silence
        )
        try:
            if not (input_csv and results_csv):
                raise RuntimeError("Something unexpected happened when running VISQOL!")
            # NOTE(review): the debug JSON lives in tmp_dir and is removed below without
            # being read here; `_collect_debug_data` is not called from this method.
            self._run_visqol(
                input_csv,
                results_csv,
                debug_json if self.debug else None,
            )
            return self._collect_moslqo_score(results_csv)
        except Exception as e:
            # Best-effort metric: report the failure and return None rather than raising.
            logger.error("Exception occurred when running ViSQOL: %s", e)
            return None
        finally:
            self._flush_files(tmp_dir)