unpairedelectron07 committed on
Commit
26b4608
1 Parent(s): f586664

Upload 6 files

audiocraft/metrics/chroma_cosinesim.py ADDED
@@ -0,0 +1,72 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torchmetrics

from ..data.audio_utils import convert_audio
from ..modules.chroma import ChromaExtractor


class ChromaCosineSimilarityMetric(torchmetrics.Metric):
    """Chroma cosine similarity metric.

    This metric extracts a chromagram for a reference waveform and
    a generated waveform and compares each frame using the cosine similarity
    function. The output is the mean cosine similarity.

    Args:
        sample_rate (int): Sample rate used by the chroma extractor.
        n_chroma (int): Number of chroma used by the chroma extractor.
        radix2_exp (int): Exponent for the chroma extractor.
        argmax (bool): Whether the chroma extractor uses argmax.
        eps (float): Epsilon for cosine similarity computation.
    """
    def __init__(self, sample_rate: int, n_chroma: int, radix2_exp: int, argmax: bool, eps: float = 1e-8):
        super().__init__()
        self.chroma_sample_rate = sample_rate
        self.n_chroma = n_chroma
        self.eps = eps
        self.chroma_extractor = ChromaExtractor(sample_rate=self.chroma_sample_rate, n_chroma=self.n_chroma,
                                                radix2_exp=radix2_exp, argmax=argmax)
        self.add_state("cosine_sum", default=torch.tensor(0.), dist_reduce_fx="sum")
        self.add_state("weight", default=torch.tensor(0.), dist_reduce_fx="sum")

    def update(self, preds: torch.Tensor, targets: torch.Tensor,
               sizes: torch.Tensor, sample_rates: torch.Tensor) -> None:
        """Compute cosine similarity between chromagrams and accumulate scores over the dataset."""
        if preds.size(0) == 0:
            return

        assert preds.shape == targets.shape, (
            f"Preds and target shapes mismatch: preds={preds.shape}, targets={targets.shape}")
        assert preds.size(0) == sizes.size(0), (
            f"Number of items in preds ({preds.shape}) mismatch "
            f"with sizes ({sizes.shape})")
        assert preds.size(0) == sample_rates.size(0), (
            f"Number of items in preds ({preds.shape}) mismatch "
            f"with sample_rates ({sample_rates.shape})")
        assert torch.all(sample_rates == sample_rates[0].item()), "All sample rates are not the same in the batch"

        device = self.weight.device
        preds, targets = preds.to(device), targets.to(device)  # type: ignore
        sample_rate = sample_rates[0].item()
        preds = convert_audio(preds, from_rate=sample_rate, to_rate=self.chroma_sample_rate, to_channels=1)
        targets = convert_audio(targets, from_rate=sample_rate, to_rate=self.chroma_sample_rate, to_channels=1)
        gt_chroma = self.chroma_extractor(targets)
        gen_chroma = self.chroma_extractor(preds)
        chroma_lens = (sizes / self.chroma_extractor.winhop).ceil().int()
        for i in range(len(gt_chroma)):
            t = int(chroma_lens[i].item())
            cosine_sim = torch.nn.functional.cosine_similarity(
                gt_chroma[i, :t], gen_chroma[i, :t], dim=1, eps=self.eps)
            self.cosine_sum += cosine_sim.sum(dim=0)  # type: ignore
            self.weight += torch.tensor(t)  # type: ignore

    def compute(self) -> float:
        """Computes the average cosine similarity across all generated/target chromagram pairs."""
        assert self.weight.item() > 0, "Unable to compute with total number of comparisons <= 0"  # type: ignore
        return (self.cosine_sum / self.weight).item()  # type: ignore
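A minimal usage sketch for the metric above (not part of the commit): the chroma parameters, tensor shapes and sample rates are illustrative, and it assumes audiocraft and its chroma extraction dependencies are installed.
```
import torch
from audiocraft.metrics.chroma_cosinesim import ChromaCosineSimilarityMetric

# Illustrative settings; use whatever chroma parameters your eval config defines.
metric = ChromaCosineSimilarityMetric(sample_rate=32_000, n_chroma=12, radix2_exp=12, argmax=True)
preds = torch.randn(2, 1, 32_000)        # generated audio [B, C, T]
targets = torch.randn(2, 1, 32_000)      # reference audio [B, C, T]
sizes = torch.tensor([32_000, 32_000])   # valid length of each item, in samples
sample_rates = torch.tensor([32_000, 32_000])
metric.update(preds, targets, sizes, sample_rates)
print(metric.compute())  # mean chroma cosine similarity over all accumulated frames
```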
audiocraft/metrics/clap_consistency.py ADDED
@@ -0,0 +1,84 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from pathlib import Path
import typing as tp

import torch
import torchmetrics
from transformers import RobertaTokenizer  # type: ignore

from ..data.audio_utils import convert_audio
from ..environment import AudioCraftEnvironment
from ..utils.utils import load_clap_state_dict

try:
    import laion_clap  # type: ignore
except ImportError:
    laion_clap = None


class TextConsistencyMetric(torchmetrics.Metric):
    """Text consistency metric measuring consistency between audio and text pairs."""

    def update(self, audio: torch.Tensor, text: tp.List[str], sizes: torch.Tensor, sample_rates: torch.Tensor) -> None:
        raise NotImplementedError("implement how to update the metric from the audio and text pairs.")

    def compute(self):
        raise NotImplementedError("implement how to compute the final metric score.")


class CLAPTextConsistencyMetric(TextConsistencyMetric):
    """Text consistency metric relying on Contrastive Language-Audio Pretraining (CLAP).

    This metric is similar to the MuLan Cycle Consistency from MusicLM (https://arxiv.org/pdf/2301.11325.pdf)
    or the CLAP score used in Make-An-Audio (https://arxiv.org/pdf/2301.12661v1.pdf).

    As a joint audio-text embedding model, a pretrained CLAP model can be used to quantify the
    similarity between audio-text pairs. We compute the CLAP embeddings from the text descriptions as
    well as the generated audio based on them, and define the MCC metric as the average cosine similarity
    between these embeddings.

    Model implementation & pre-trained checkpoints: https://github.com/LAION-AI/CLAP
    """
    def __init__(self, model_path: tp.Union[str, Path], model_arch: str = 'HTSAT-tiny', enable_fusion: bool = False):
        super().__init__()
        if laion_clap is None:
            raise ImportError("Please install CLAP to compute text consistency: 'pip install laion_clap'")
        self.add_state("cosine_sum", default=torch.tensor(0.), dist_reduce_fx="sum")
        self.add_state("weight", default=torch.tensor(0.), dist_reduce_fx="sum")
        self._initialize_model(model_path, model_arch, enable_fusion)

    def _initialize_model(self, model_path: tp.Union[str, Path], model_arch: str, enable_fusion: bool):
        model_path = AudioCraftEnvironment.resolve_reference_path(model_path)
        self.tokenize = RobertaTokenizer.from_pretrained('roberta-base')
        self.model = laion_clap.CLAP_Module(enable_fusion=enable_fusion, amodel=model_arch)
        self.model_sample_rate = 48_000
        load_clap_state_dict(self.model, model_path)
        self.model.eval()

    def _tokenizer(self, texts: tp.Union[str, tp.List[str]]) -> dict:
        # we use the default params from the CLAP module here as well
        return self.tokenize(texts, padding="max_length", truncation=True, max_length=77, return_tensors="pt")

    def update(self, audio: torch.Tensor, text: tp.List[str], sizes: torch.Tensor, sample_rates: torch.Tensor) -> None:
        """Compute cosine similarity between audio and text pairs and accumulate scores over the dataset."""
        assert audio.size(0) == len(text), "Number of audio and text samples should match"
        assert torch.all(sample_rates == sample_rates[0].item()), "All items in batch should have the same sample rate"
        sample_rate = int(sample_rates[0].item())
        # convert audio batch to 48kHz monophonic audio with no channel dimension: [B, C, T] -> [B, T]
        audio = convert_audio(audio, from_rate=sample_rate, to_rate=self.model_sample_rate, to_channels=1).mean(dim=1)
        audio_embeddings = self.model.get_audio_embedding_from_data(audio, use_tensor=True)
        text_embeddings = self.model.get_text_embedding(text, tokenizer=self._tokenizer, use_tensor=True)
        # cosine similarity between the text and the audio embedding
        cosine_sim = torch.nn.functional.cosine_similarity(audio_embeddings, text_embeddings, dim=1, eps=1e-8)
        self.cosine_sum += cosine_sim.sum(dim=0)
        self.weight += torch.tensor(cosine_sim.size(0))

    def compute(self):
        """Computes the average cosine similarity across all audio/text pairs."""
        assert self.weight.item() > 0, "Unable to compute with total number of comparisons <= 0"  # type: ignore
        return (self.cosine_sum / self.weight).item()  # type: ignore
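A hedged usage sketch for the CLAP-based metric (not part of the commit): the checkpoint path and model architecture below are placeholders, and a real LAION-CLAP checkpoint plus `pip install laion_clap` are required.
```
import torch
from audiocraft.metrics.clap_consistency import CLAPTextConsistencyMetric

# Placeholder checkpoint path and architecture.
metric = CLAPTextConsistencyMetric(model_path="/checkpoints/music_audioset_epoch_15_esc_90.14.pt",
                                   model_arch="HTSAT-base")
audio = torch.randn(2, 1, 48_000)       # generated audio [B, C, T]
text = ["happy rock", "sad jazz"]       # matching text descriptions
sizes = torch.tensor([48_000, 48_000])
sample_rates = torch.tensor([48_000, 48_000])
metric.update(audio, text, sizes, sample_rates)
print(metric.compute())  # average audio/text CLAP cosine similarity
```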
audiocraft/metrics/fad.py ADDED
@@ -0,0 +1,329 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import logging
from pathlib import Path
import os
import subprocess
import tempfile
import typing as tp

from audiocraft.data.audio import audio_write
from audiocraft.data.audio_utils import convert_audio
import flashy
import torch
import torchmetrics

from ..environment import AudioCraftEnvironment


logger = logging.getLogger(__name__)

VGGISH_SAMPLE_RATE = 16_000
VGGISH_CHANNELS = 1


class FrechetAudioDistanceMetric(torchmetrics.Metric):
    """Fréchet Audio Distance computation based on the official TensorFlow implementation from Google Research.

    From: D.C. Dowson & B.V. Landau, "The Fréchet distance between multivariate normal distributions",
    https://doi.org/10.1016/0047-259X(82)90077-X
    The Fréchet distance between two multivariate gaussians,
    `X ~ N(mu_x, sigma_x)` and `Y ~ N(mu_y, sigma_y)`, is `d^2`.
        d^2 = (mu_x - mu_y)^2 + Tr(sigma_x + sigma_y - 2 * sqrt(sigma_x*sigma_y))
            = (mu_x - mu_y)^2 + Tr(sigma_x) + Tr(sigma_y) - 2 * Tr(sqrt(sigma_x*sigma_y))

    To use this FAD computation metric, you need to have the proper Frechet Audio Distance tool setup
    from: https://github.com/google-research/google-research/tree/master/frechet_audio_distance
    We provide the instructions below as a reference, but we do not guarantee further support
    for the frechet_audio_distance installation. This was tested with python 3.10, cuda 11.8, tensorflow 2.12.0.

    We recommend installing the frechet_audio_distance library in a dedicated env (e.g. conda).

    1. Get the code and models following the repository instructions. We used the steps below:
        git clone git@github.com:google-research/google-research.git
        git clone git@github.com:tensorflow/models.git
        mkdir google-research/tensorflow_models
        touch google-research/tensorflow_models/__init__.py
        cp -r models/research/audioset google-research/tensorflow_models/
        touch google-research/tensorflow_models/audioset/__init__.py
        echo "from .vggish import mel_features, vggish_params, vggish_slim" > \
            google-research/tensorflow_models/audioset/__init__.py
        # we can now remove the tensorflow models repository
        # rm -r models
        cd google-research

       Follow the instructions to download the vggish checkpoint. AudioCraft base configuration
       assumes it is placed in the AudioCraft reference dir.

       Note that we apply the following changes for the code to work with TensorFlow 2.X and python 3:
       - Update xrange to range in:
         https://github.com/google-research/google-research/blob/master/frechet_audio_distance/audioset_model.py
       - Update `tf_record = tf.python_io.tf_record_iterator(filename).next()` to
         `tf_record = tf.python_io.tf_record_iterator(filename).__next__()` in
         https://github.com/google-research/google-research/blob/master/frechet_audio_distance/fad_utils.py
       - Update `import vggish_params as params` to `from . import vggish_params as params` in:
         https://github.com/tensorflow/models/blob/master/research/audioset/vggish/vggish_slim.py
       - Add a flag to provide a given batch size for running the AudioSet model in:
         https://github.com/google-research/google-research/blob/master/frechet_audio_distance/create_embeddings_main.py
         ```
         flags.DEFINE_integer('batch_size', 64,
                              'Number of samples in the batch for AudioSet model.')
         ```
         Ensure you pass the flag to the create_embeddings_beam.create_pipeline function, adding:
         `batch_size=FLAGS.batch_size` to the provided parameters.

    2. Follow the instructions for the library installation and a valid TensorFlow installation
       ```
       # e.g. instructions from: https://www.tensorflow.org/install/pip
       conda install -c conda-forge cudatoolkit=11.8.0
       python3 -m pip install nvidia-cudnn-cu11==8.6.0.163 tensorflow==2.12.*
       mkdir -p $CONDA_PREFIX/etc/conda/activate.d
       echo 'CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)"))' \
           >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
       echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/:$CUDNN_PATH/lib' \
           >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
       source $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
       # Verify install: on a machine with GPU device
       python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"
       ```

       Now install the frechet_audio_distance required dependencies:
       ```
       # We assume we already have TensorFlow installed from the above steps
       pip install apache-beam numpy scipy tf_slim
       ```

       Finally, follow the remaining library instructions to ensure you have a working frechet_audio_distance setup
       (you may want to specify the --model_ckpt flag pointing to the model's path).

    3. AudioCraft's FrechetAudioDistanceMetric requires 2 environment variables pointing to the python executable
       and Tensorflow library path from the above installation steps:
        export TF_PYTHON_EXE="<PATH_TO_THE_ENV_PYTHON_BINARY>"
        export TF_LIBRARY_PATH="<PATH_TO_THE_ENV_CUDNN_LIBRARY>"

        e.g. assuming we have installed everything in a dedicated conda env
        with python 3.10 that is currently active:
        export TF_PYTHON_EXE="$CONDA_PREFIX/bin/python"
        export TF_LIBRARY_PATH="$CONDA_PREFIX/lib/python3.10/site-packages/nvidia/cudnn/lib"

        Finally you may want to export the following variable:
        export TF_FORCE_GPU_ALLOW_GROWTH=true
        See: https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth

        You can save those environment variables in your training conda env, when currently active:
        `$CONDA_PREFIX/etc/conda/activate.d/env_vars.sh`
        e.g. assuming the env with TensorFlow and frechet_audio_distance install is named ac_eval,
        and the training conda env is named audiocraft:
        ```
        # activate training env
        conda activate audiocraft
        # get path to all envs
        CONDA_ENV_DIR=$(dirname $CONDA_PREFIX)
        # export pointers to evaluation env for using TensorFlow in FrechetAudioDistanceMetric
        touch $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
        echo 'export TF_PYTHON_EXE="$CONDA_ENV_DIR/ac_eval/bin/python"' >> \
            $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
        echo 'export TF_LIBRARY_PATH="$CONDA_ENV_DIR/ac_eval/lib/python3.10/site-packages/nvidia/cudnn/lib"' >> \
            $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
        # optionally:
        echo 'export TF_FORCE_GPU_ALLOW_GROWTH=true' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
        # you may need to reactivate the audiocraft env for this to take effect
        ```

    Args:
        bin (Path or str): Path to installed frechet audio distance code.
        model_path (Path or str): Path to Tensorflow checkpoint for the model
            used to compute statistics over the embedding beams.
        format (str): Audio format used to save files.
        log_folder (Path or str, optional): Path where to write process logs.
    """
    def __init__(self, bin: tp.Union[Path, str], model_path: tp.Union[Path, str],
                 format: str = "wav", batch_size: tp.Optional[int] = None,
                 log_folder: tp.Optional[tp.Union[Path, str]] = None):
        super().__init__()
        self.model_sample_rate = VGGISH_SAMPLE_RATE
        self.model_channels = VGGISH_CHANNELS
        self.model_path = AudioCraftEnvironment.resolve_reference_path(model_path)
        assert Path(self.model_path).exists(), f"Could not find provided model checkpoint path at: {self.model_path}"
        self.format = format
        self.batch_size = batch_size
        self.bin = bin
        self.tf_env = {"PYTHONPATH": str(self.bin)}
        self.python_path = os.environ.get('TF_PYTHON_EXE') or 'python'
        logger.info("Python exe for TF is %s", self.python_path)
        if 'TF_LIBRARY_PATH' in os.environ:
            self.tf_env['LD_LIBRARY_PATH'] = os.environ['TF_LIBRARY_PATH']
        if 'TF_FORCE_GPU_ALLOW_GROWTH' in os.environ:
            self.tf_env['TF_FORCE_GPU_ALLOW_GROWTH'] = os.environ['TF_FORCE_GPU_ALLOW_GROWTH']
        logger.info("Env for TF is %r", self.tf_env)
        self.reset(log_folder)
        self.add_state("total_files", default=torch.tensor(0.), dist_reduce_fx="sum")

    def reset(self, log_folder: tp.Optional[tp.Union[Path, str]] = None):
        """Reset torchmetrics.Metrics state."""
        log_folder = Path(log_folder or tempfile.mkdtemp())
        self.tmp_dir = log_folder / 'fad'
        self.tmp_dir.mkdir(exist_ok=True)
        self.samples_tests_dir = self.tmp_dir / 'tests'
        self.samples_tests_dir.mkdir(exist_ok=True)
        self.samples_background_dir = self.tmp_dir / 'background'
        self.samples_background_dir.mkdir(exist_ok=True)
        self.manifest_tests = self.tmp_dir / 'files_tests.cvs'
        self.manifest_background = self.tmp_dir / 'files_background.cvs'
        self.stats_tests_dir = self.tmp_dir / 'stats_tests'
        self.stats_background_dir = self.tmp_dir / 'stats_background'
        self.counter = 0

    def update(self, preds: torch.Tensor, targets: torch.Tensor,
               sizes: torch.Tensor, sample_rates: torch.Tensor,
               stems: tp.Optional[tp.List[str]] = None):
        """Update torchmetrics.Metrics by saving the audio and updating the manifest file."""
        assert preds.shape == targets.shape, f"preds={preds.shape} != targets={targets.shape}"
        num_samples = preds.shape[0]
        assert num_samples == sizes.size(0) and num_samples == sample_rates.size(0)
        assert stems is None or num_samples == len(set(stems))
        for i in range(num_samples):
            self.total_files += 1  # type: ignore
            self.counter += 1
            wav_len = int(sizes[i].item())
            sample_rate = int(sample_rates[i].item())
            pred_wav = preds[i]
            target_wav = targets[i]
            pred_wav = pred_wav[..., :wav_len]
            target_wav = target_wav[..., :wav_len]
            stem_name = stems[i] if stems is not None else f'sample_{self.counter}_{flashy.distrib.rank()}'
            # dump audio files
            try:
                pred_wav = convert_audio(
                    pred_wav.unsqueeze(0), from_rate=sample_rate,
                    to_rate=self.model_sample_rate, to_channels=1).squeeze(0)
                audio_write(
                    self.samples_tests_dir / stem_name, pred_wav, sample_rate=self.model_sample_rate,
                    format=self.format, strategy="peak")
            except Exception as e:
                logger.error(f"Exception occurred when saving tests files for FAD computation: {repr(e)} - {e}")
            try:
                # for the ground truth audio, we enforce the 'peak' strategy to avoid modifying
                # the original audio when writing it
                target_wav = convert_audio(
                    target_wav.unsqueeze(0), from_rate=sample_rate,
                    to_rate=self.model_sample_rate, to_channels=1).squeeze(0)
                audio_write(
                    self.samples_background_dir / stem_name, target_wav, sample_rate=self.model_sample_rate,
                    format=self.format, strategy="peak")
            except Exception as e:
                logger.error(f"Exception occurred when saving background files for FAD computation: {repr(e)} - {e}")

    def _get_samples_name(self, is_background: bool):
        return 'background' if is_background else 'tests'

    def _create_embedding_beams(self, is_background: bool, gpu_index: tp.Optional[int] = None):
        if is_background:
            input_samples_dir = self.samples_background_dir
            input_filename = self.manifest_background
            stats_name = self.stats_background_dir
        else:
            input_samples_dir = self.samples_tests_dir
            input_filename = self.manifest_tests
            stats_name = self.stats_tests_dir
        beams_name = self._get_samples_name(is_background)
        log_file = self.tmp_dir / f'fad_logs_create_beams_{beams_name}.log'

        logger.info(f"Scanning samples folder to fetch list of files: {input_samples_dir}")
        with open(input_filename, "w") as fout:
            for path in Path(input_samples_dir).glob(f"*.{self.format}"):
                fout.write(f"{str(path)}\n")

        cmd = [
            self.python_path, "-m",
            "frechet_audio_distance.create_embeddings_main",
            "--model_ckpt", f"{self.model_path}",
            "--input_files", f"{str(input_filename)}",
            "--stats", f"{str(stats_name)}",
        ]
        if self.batch_size is not None:
            cmd += ["--batch_size", str(self.batch_size)]
        logger.info(f"Launching frechet_audio_distance embeddings main method: {' '.join(cmd)} on {beams_name}")
        env = os.environ
        if gpu_index is not None:
            env["CUDA_VISIBLE_DEVICES"] = str(gpu_index)
        process = subprocess.Popen(
            cmd, stdout=open(log_file, "w"), env={**env, **self.tf_env}, stderr=subprocess.STDOUT)
        return process, log_file

    def _compute_fad_score(self, gpu_index: tp.Optional[int] = None):
        cmd = [
            self.python_path, "-m", "frechet_audio_distance.compute_fad",
            "--test_stats", f"{str(self.stats_tests_dir)}",
            "--background_stats", f"{str(self.stats_background_dir)}",
        ]
        logger.info(f"Launching frechet_audio_distance compute fad method: {' '.join(cmd)}")
        env = os.environ
        if gpu_index is not None:
            env["CUDA_VISIBLE_DEVICES"] = str(gpu_index)
        result = subprocess.run(cmd, env={**env, **self.tf_env}, capture_output=True)
        if result.returncode:
            logger.error(
                "Error with FAD computation from stats: \n %s \n %s",
                result.stdout.decode(), result.stderr.decode()
            )
            raise RuntimeError("Error while executing FAD computation from stats")
        try:
            # result is "FAD: (d+).(d+)" hence we remove the prefix, with (d+) being one digit or more
            fad_score = float(result.stdout[4:])
            return fad_score
        except Exception as e:
            raise RuntimeError(f"Error parsing FAD score from command stdout: {e}")

    def _log_process_result(self, returncode: int, log_file: tp.Union[Path, str], is_background: bool) -> None:
        beams_name = self._get_samples_name(is_background)
        if returncode:
            with open(log_file, "r") as f:
                error_log = f.read()
                logger.error(error_log)
            os._exit(1)
        else:
            logger.info(f"Successfully computed embedding beams on {beams_name} samples.")

    def _parallel_create_embedding_beams(self, num_of_gpus: int):
        assert num_of_gpus > 0
        logger.info("Creating embeddings beams in a parallel manner on different GPUs")
        tests_beams_process, tests_beams_log_file = self._create_embedding_beams(is_background=False, gpu_index=0)
        bg_beams_process, bg_beams_log_file = self._create_embedding_beams(is_background=True, gpu_index=1)
        tests_beams_code = tests_beams_process.wait()
        bg_beams_code = bg_beams_process.wait()
        self._log_process_result(tests_beams_code, tests_beams_log_file, is_background=False)
        self._log_process_result(bg_beams_code, bg_beams_log_file, is_background=True)

    def _sequential_create_embedding_beams(self):
        logger.info("Creating embeddings beams in a sequential manner")
        tests_beams_process, tests_beams_log_file = self._create_embedding_beams(is_background=False)
        tests_beams_code = tests_beams_process.wait()
        self._log_process_result(tests_beams_code, tests_beams_log_file, is_background=False)
        bg_beams_process, bg_beams_log_file = self._create_embedding_beams(is_background=True)
        bg_beams_code = bg_beams_process.wait()
        self._log_process_result(bg_beams_code, bg_beams_log_file, is_background=True)

    @flashy.distrib.rank_zero_only
    def _local_compute_frechet_audio_distance(self):
        """Compute Frechet Audio Distance score calling TensorFlow API."""
        num_of_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
        if num_of_gpus > 1:
            self._parallel_create_embedding_beams(num_of_gpus)
        else:
            self._sequential_create_embedding_beams()
        fad_score = self._compute_fad_score(gpu_index=0)
        return fad_score

    def compute(self) -> float:
        """Compute metrics."""
        assert self.total_files.item() > 0, "No files dumped for FAD computation!"  # type: ignore
        fad_score = self._local_compute_frechet_audio_distance()
        logger.warning(f"FAD score = {fad_score}")
        fad_score = flashy.distrib.broadcast_object(fad_score, src=0)
        return fad_score
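A hedged sketch of how this metric might be driven (not part of the commit): all paths are placeholders, and it assumes the frechet_audio_distance tooling, the VGGish checkpoint and the TF_PYTHON_EXE/TF_LIBRARY_PATH environment variables are set up as described in the docstring above.
```
import torch
from audiocraft.metrics.fad import FrechetAudioDistanceMetric

# Placeholder paths: `bin` is the cloned google-research checkout (used as PYTHONPATH),
# `model_path` is the downloaded VGGish TensorFlow checkpoint.
metric = FrechetAudioDistanceMetric(
    bin="/install/google-research",
    model_path="/install/vggish_model.ckpt",
    log_folder="/tmp/fad_eval")
preds = torch.randn(2, 1, 16_000)    # generated audio [B, C, T]
targets = torch.randn(2, 1, 16_000)  # reference audio [B, C, T]
sizes = torch.tensor([16_000, 16_000])
sample_rates = torch.tensor([16_000, 16_000])
metric.update(preds, targets, sizes, sample_rates)  # writes wav files and manifests to the log folder
score = metric.compute()                            # spawns the TensorFlow subprocesses and returns the FAD
```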
audiocraft/metrics/kld.py ADDED
@@ -0,0 +1,220 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import contextlib
from functools import partial
import logging
import os
import typing as tp

import torch
import torchmetrics

from ..data.audio_utils import convert_audio


logger = logging.getLogger(__name__)


class _patch_passt_stft:
    """Context manager to patch torch.stft in PaSST."""
    def __init__(self):
        self.old_stft = torch.stft

    def __enter__(self):
        # return_complex is a mandatory parameter in latest torch versions
        # torch is throwing RuntimeErrors when not set
        torch.stft = partial(torch.stft, return_complex=False)

    def __exit__(self, *exc):
        torch.stft = self.old_stft


def kl_divergence(pred_probs: torch.Tensor, target_probs: torch.Tensor, epsilon: float = 1e-6) -> torch.Tensor:
    """Computes the elementwise KL-Divergence loss between probability distributions
    from generated samples and target samples.

    Args:
        pred_probs (torch.Tensor): Probabilities for each label obtained
            from a classifier on generated audio. Expected shape is [B, num_classes].
        target_probs (torch.Tensor): Probabilities for each label obtained
            from a classifier on target audio. Expected shape is [B, num_classes].
        epsilon (float): Epsilon value.
    Returns:
        kld (torch.Tensor): KLD loss between each generated sample and target pair.
    """
    kl_div = torch.nn.functional.kl_div((pred_probs + epsilon).log(), target_probs, reduction="none")
    return kl_div.sum(-1)


class KLDivergenceMetric(torchmetrics.Metric):
    """Base implementation for KL Divergence metric.

    The KL divergence is measured between probability distributions
    of class predictions returned by a pre-trained audio classification model.
    When the KL-divergence is low, the generated audio is expected to
    have similar acoustic characteristics as the reference audio,
    according to the classifier.
    """
    def __init__(self):
        super().__init__()
        self.add_state("kld_pq_sum", default=torch.tensor(0.), dist_reduce_fx="sum")
        self.add_state("kld_qp_sum", default=torch.tensor(0.), dist_reduce_fx="sum")
        self.add_state("kld_all_sum", default=torch.tensor(0.), dist_reduce_fx="sum")
        self.add_state("weight", default=torch.tensor(0), dist_reduce_fx="sum")

    def _get_label_distribution(self, x: torch.Tensor, sizes: torch.Tensor,
                                sample_rates: torch.Tensor) -> tp.Optional[torch.Tensor]:
        """Get model output given provided input tensor.

        Args:
            x (torch.Tensor): Input audio tensor of shape [B, C, T].
            sizes (torch.Tensor): Actual audio sample length, of shape [B].
            sample_rates (torch.Tensor): Actual audio sample rate, of shape [B].
        Returns:
            probs (torch.Tensor): Probabilities over labels, of shape [B, num_classes].
        """
        raise NotImplementedError("implement method to extract label distributions from the model.")

    def update(self, preds: torch.Tensor, targets: torch.Tensor,
               sizes: torch.Tensor, sample_rates: torch.Tensor) -> None:
        """Calculates running KL-Divergence loss between batches of audio
        preds (generated) and targets (ground-truth).
        Args:
            preds (torch.Tensor): Audio samples to evaluate, of shape [B, C, T].
            targets (torch.Tensor): Target samples to compare against, of shape [B, C, T].
            sizes (torch.Tensor): Actual audio sample length, of shape [B].
            sample_rates (torch.Tensor): Actual audio sample rate, of shape [B].
        """
        assert preds.shape == targets.shape
        assert preds.size(0) > 0, "Cannot update the loss with empty tensors"
        preds_probs = self._get_label_distribution(preds, sizes, sample_rates)
        targets_probs = self._get_label_distribution(targets, sizes, sample_rates)
        if preds_probs is not None and targets_probs is not None:
            assert preds_probs.shape == targets_probs.shape
            kld_scores = kl_divergence(preds_probs, targets_probs)
            assert not torch.isnan(kld_scores).any(), "kld_scores contains NaN value(s)!"
            self.kld_pq_sum += torch.sum(kld_scores)
            kld_qp_scores = kl_divergence(targets_probs, preds_probs)
            self.kld_qp_sum += torch.sum(kld_qp_scores)
            self.weight += torch.tensor(kld_scores.size(0))

    def compute(self) -> dict:
        """Computes KL-Divergence across all evaluated pred/target pairs."""
        weight: float = float(self.weight.item())  # type: ignore
        assert weight > 0, "Unable to compute with total number of comparisons <= 0"
        logger.info(f"Computing KL divergence on a total of {weight} samples")
        kld_pq = self.kld_pq_sum.item() / weight  # type: ignore
        kld_qp = self.kld_qp_sum.item() / weight  # type: ignore
        kld_both = kld_pq + kld_qp
        return {'kld': kld_pq, 'kld_pq': kld_pq, 'kld_qp': kld_qp, 'kld_both': kld_both}


class PasstKLDivergenceMetric(KLDivergenceMetric):
    """KL-Divergence metric based on pre-trained PaSST classifier on AudioSet.

    From: PaSST: Efficient Training of Audio Transformers with Patchout
    Paper: https://arxiv.org/abs/2110.05069
    Implementation: https://github.com/kkoutini/PaSST

    Follow instructions from the github repo:
    ```
    pip install 'git+https://github.com/kkoutini/passt_hear21@0.0.19#egg=hear21passt'
    ```

    Args:
        pretrained_length (float, optional): Audio duration used for the pretrained model.
    """
    def __init__(self, pretrained_length: tp.Optional[float] = None):
        super().__init__()
        self._initialize_model(pretrained_length)

    def _initialize_model(self, pretrained_length: tp.Optional[float] = None):
        """Initialize underlying PaSST audio classifier."""
        model, sr, max_frames, min_frames = self._load_base_model(pretrained_length)
        self.min_input_frames = min_frames
        self.max_input_frames = max_frames
        self.model_sample_rate = sr
        self.model = model
        self.model.eval()
        self.model.to(self.device)

    def _load_base_model(self, pretrained_length: tp.Optional[float]):
        """Load pretrained model from PaSST."""
        try:
            if pretrained_length == 30:
                from hear21passt.base30sec import get_basic_model  # type: ignore
                max_duration = 30
            elif pretrained_length == 20:
                from hear21passt.base20sec import get_basic_model  # type: ignore
                max_duration = 20
            else:
                from hear21passt.base import get_basic_model  # type: ignore
                # Original PaSST was trained on AudioSet with 10s-long audio samples
                max_duration = 10
            min_duration = 0.15
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "Please install hear21passt to compute KL divergence: "
                "pip install 'git+https://github.com/kkoutini/passt_hear21@0.0.19#egg=hear21passt'"
            )
        model_sample_rate = 32_000
        max_input_frames = int(max_duration * model_sample_rate)
        min_input_frames = int(min_duration * model_sample_rate)
        with open(os.devnull, 'w') as f, contextlib.redirect_stdout(f):
            model = get_basic_model(mode='logits')
        return model, model_sample_rate, max_input_frames, min_input_frames

    def _process_audio(self, wav: torch.Tensor, sample_rate: int, wav_len: int) -> tp.List[torch.Tensor]:
        """Process audio to feed to the pretrained model."""
        wav = wav.unsqueeze(0)
        wav = wav[..., :wav_len]
        wav = convert_audio(wav, from_rate=sample_rate, to_rate=self.model_sample_rate, to_channels=1)
        wav = wav.squeeze(0)
        # we don't pad but return a list of audio segments as this otherwise affects the KLD computation
        segments = torch.split(wav, self.max_input_frames, dim=-1)
        valid_segments = []
        for s in segments:
            # ignoring too small segments that are breaking the model inference
            if s.size(-1) > self.min_input_frames:
                valid_segments.append(s)
        return [s[None] for s in valid_segments]

    def _get_model_preds(self, wav: torch.Tensor) -> torch.Tensor:
        """Run the pretrained model and get the predictions."""
        assert wav.dim() == 3, f"Unexpected number of dims for preprocessed wav: {wav.shape}"
        wav = wav.mean(dim=1)
        # PaSST is printing a lot of garbage that we are not interested in
        with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
            with torch.no_grad(), _patch_passt_stft():
                logits = self.model(wav.to(self.device))
                probs = torch.softmax(logits, dim=-1)
                return probs

    def _get_label_distribution(self, x: torch.Tensor, sizes: torch.Tensor,
                                sample_rates: torch.Tensor) -> tp.Optional[torch.Tensor]:
        """Get model output given provided input tensor.

        Args:
            x (torch.Tensor): Input audio tensor of shape [B, C, T].
            sizes (torch.Tensor): Actual audio sample length, of shape [B].
            sample_rates (torch.Tensor): Actual audio sample rate, of shape [B].
        Returns:
            probs (torch.Tensor, optional): Probabilities over labels, of shape [B, num_classes].
        """
        all_probs: tp.List[torch.Tensor] = []
        for i, wav in enumerate(x):
            sample_rate = int(sample_rates[i].item())
            wav_len = int(sizes[i].item())
            wav_segments = self._process_audio(wav, sample_rate, wav_len)
            for segment in wav_segments:
                probs = self._get_model_preds(segment).mean(dim=0)
                all_probs.append(probs)
        if len(all_probs) > 0:
            return torch.stack(all_probs, dim=0)
        else:
            return None
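A minimal usage sketch for the PaSST-based KLD metric (not part of the commit), assuming hear21passt is installed as per the docstring; durations and shapes are illustrative.
```
import torch
from audiocraft.metrics.kld import PasstKLDivergenceMetric

metric = PasstKLDivergenceMetric()              # default: 10s AudioSet-pretrained PaSST at 32 kHz
preds = torch.randn(2, 1, 5 * 32_000)           # generated audio [B, C, T]
targets = torch.randn(2, 1, 5 * 32_000)         # reference audio [B, C, T]
sizes = torch.tensor([5 * 32_000, 5 * 32_000])  # valid lengths in samples
sample_rates = torch.tensor([32_000, 32_000])
metric.update(preds, targets, sizes, sample_rates)
print(metric.compute())  # dict with 'kld', 'kld_pq', 'kld_qp', 'kld_both'
```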
audiocraft/metrics/rvm.py ADDED
@@ -0,0 +1,110 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import typing as tp
import torch
from torch import nn
import torchaudio


def db_to_scale(volume: tp.Union[float, torch.Tensor]):
    return 10 ** (volume / 20)


def scale_to_db(scale: torch.Tensor, min_volume: float = -120):
    min_scale = db_to_scale(min_volume)
    return 20 * torch.log10(scale.clamp(min=min_scale))


class RelativeVolumeMel(nn.Module):
    """Relative volume melspectrogram measure.

    Computes a measure of distance over two mel spectrograms that is interpretable in terms
    of decibels. Given `x_ref` and `x_est`, two waveforms of shape `[*, T]`, it will
    first renormalize both by the volume of the ground truth `x_ref`.

    ..Warning:: This class returns the volume of the distortion at the spectrogram level,
        e.g. low negative values reflect lower distortion levels. For an SNR (like reported
        in the MultiBandDiffusion paper), just take `-rvm`.

    Then it computes the mel spectrograms `z_ref` and `z_est` and computes the volume of the difference
    relative to the volume of `z_ref` for each time-frequency bin. It further adds some limits, e.g.
    clamping the values between -25 and 25 dB (controlled by `min_relative_volume` and `max_relative_volume`)
    with the goal of avoiding the loss being dominated by parts where the reference is almost silent.
    Indeed, volumes in dB can take unbounded values both towards -oo and +oo, which can make the final
    average metric harder to interpret. Besides, anything below -30 dB of attenuation would sound extremely
    good (for a neural network output, although sound engineers typically aim for much lower attenuations).
    Similarly, anything above +30 dB would just be completely missing the target, and there is no point
    in measuring by exactly how much it missed it. -25, 25 is a more conservative range, but also more
    in line with what neural nets currently can achieve.

    For instance, a Relative Volume Mel (RVM) score of -10 dB means that on average, the delta between
    the target and reference mel-spec is 10 dB lower than the reference mel-spec value.

    The metric can be aggregated over a given frequency band in order to have different insights for
    different regions of the spectrum. `num_aggregated_bands` controls the number of bands.

    ..Warning:: While this function is optimized for interpretability, nothing was done to ensure it
        is numerically stable when computing its gradient. We thus advise against using it as a training loss.

    Args:
        sample_rate (int): Sample rate of the input audio.
        n_mels (int): Number of mel bands to use.
        n_fft (int): Number of frequency bins for the STFT.
        hop_length (int): Hop length of the STFT and the mel-spectrogram.
        min_relative_volume (float): The error `z_ref - z_est` volume is given relative to
            the volume of `z_ref`. If the error is smaller than -25 dB of `z_ref`, then it is clamped.
        max_relative_volume (float): Same as `min_relative_volume` but clamping if the error is larger than that.
        max_initial_gain (float): When rescaling the audio at the very beginning, we will limit the gain
            to that amount, to avoid rescaling near silence. Given in dB.
        min_activity_volume (float): When computing the reference level from `z_ref`, will clamp low volume
            bins to that amount. This is effectively our "zero" level for the reference mel-spectrogram,
            and anything below that will be considered equally.
        num_aggregated_bands (int): Number of bands to keep when computing the average RVM value.
            For instance, a value of 3 would give 3 scores, roughly for low, mid and high freqs.
    """
    def __init__(self, sample_rate: int = 24000, n_mels: int = 80, n_fft: int = 512,
                 hop_length: int = 128, min_relative_volume: float = -25,
                 max_relative_volume: float = 25, max_initial_gain: float = 25,
                 min_activity_volume: float = -25,
                 num_aggregated_bands: int = 4) -> None:
        super().__init__()
        self.melspec = torchaudio.transforms.MelSpectrogram(
            n_mels=n_mels, n_fft=n_fft, hop_length=hop_length,
            normalized=True, sample_rate=sample_rate, power=2)
        self.min_relative_volume = min_relative_volume
        self.max_relative_volume = max_relative_volume
        self.max_initial_gain = max_initial_gain
        self.min_activity_volume = min_activity_volume
        self.num_aggregated_bands = num_aggregated_bands

    def forward(self, estimate: torch.Tensor, ground_truth: torch.Tensor) -> tp.Dict[str, torch.Tensor]:
        """Compute RVM metric between estimate and reference samples.

        Args:
            estimate (torch.Tensor): Estimate sample.
            ground_truth (torch.Tensor): Reference sample.

        Returns:
            dict[str, torch.Tensor]: Metrics with keys `rvm` for the overall average, and `rvm_{k}`
            for the RVM over the k-th band (k=0..num_aggregated_bands - 1).
        """
        min_scale = db_to_scale(-self.max_initial_gain)
        std = ground_truth.pow(2).mean().sqrt().clamp(min=min_scale)
        z_gt = self.melspec(ground_truth / std).sqrt()
        z_est = self.melspec(estimate / std).sqrt()

        delta = z_gt - z_est
        ref_db = scale_to_db(z_gt, self.min_activity_volume)
        delta_db = scale_to_db(delta.abs(), min_volume=-120)
        relative_db = (delta_db - ref_db).clamp(self.min_relative_volume, self.max_relative_volume)
        dims = list(range(relative_db.dim()))
        dims.remove(dims[-2])
        losses_per_band = relative_db.mean(dim=dims)
        aggregated = [chunk.mean() for chunk in losses_per_band.chunk(self.num_aggregated_bands, dim=0)]
        metrics = {f'rvm_{index}': value for index, value in enumerate(aggregated)}
        metrics['rvm'] = losses_per_band.mean()
        return metrics
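A small self-contained sketch of the RVM measure above (not part of the commit); the waveforms are synthetic and only meant to show the expected shapes and outputs.
```
import torch
from audiocraft.metrics.rvm import RelativeVolumeMel

rvm = RelativeVolumeMel(sample_rate=24_000)
ground_truth = torch.randn(1, 24_000)                    # reference waveform [*, T]
estimate = ground_truth + 0.01 * torch.randn(1, 24_000)  # lightly distorted estimate
scores = rvm(estimate, ground_truth)
print(scores['rvm'])    # overall relative distortion volume, in dB (lower is better)
print(scores['rvm_0'])  # same, restricted to the lowest-frequency band
```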
audiocraft/metrics/visqol.py ADDED
@@ -0,0 +1,216 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import csv
import json
import logging
from pathlib import Path
import tempfile
import typing as tp
import subprocess
import shutil

import torch
import torchaudio

logger = logging.getLogger(__name__)


class ViSQOL:
    """ViSQOL wrapper to run ViSQOL from Python using a pre-installed binary.

    To learn more about ViSQOL and how to build the ViSQOL binary using bazel, please refer to the
    instructions available in the open source repository: https://github.com/google/visqol

    ViSQOL is capable of running in two modes:

    Audio Mode:
        When running in audio mode, input signals must have a 48kHz sample rate. Input should be resampled to 48kHz.
        Input signals can be multi-channel, but they will be down-mixed to mono for performing the comparison.
        Audio mode uses support vector regression, with the maximum range at ~4.75.

    Speech Mode:
        When running in speech mode, ViSQOL uses a wideband model. It therefore expects input sample rates of 16kHz.
        Input should be resampled to 16kHz.
        As part of the speech mode processing, a root mean square implementation for voice activity detection
        is performed on the reference signal to determine what parts of the signal have voice activity and
        should therefore be included in the comparison. The signal is normalized before performing the voice
        activity detection.
        Input signals can be multi-channel, but they will be down-mixed to mono for performing the comparison.
        Speech mode is scaled to have a maximum MOS of 5.0 to match previous version behavior.

    For more details, check the guidelines: https://github.com/google/visqol#general-guidelines-for-input

    Args:
        visqol_bin (str): Path to the ViSQOL binary.
        mode (str): ViSQOL computation mode, expecting "audio" or "speech".
        model (str): Name of the model to use for similarity to quality model.
        debug (bool): Whether to also get debug metrics from ViSQOL or not.
    """
    SAMPLE_RATES_MODES = {"audio": 48_000, "speech": 16_000}
    ALLOWED_SAMPLE_RATES = frozenset(SAMPLE_RATES_MODES.values())

    def __init__(self, bin: tp.Union[Path, str], mode: str = "audio",
                 model: str = "libsvm_nu_svr_model.txt", debug: bool = False):
        assert bin is not None and Path(bin).exists(), f"Could not find ViSQOL binary in specified path: {bin}"
        self.visqol_bin = str(bin)
        self.visqol_mode = mode
        self.target_sr = self._get_target_sr(self.visqol_mode)
        self.model = model
        self.debug = debug
        assert Path(self.visqol_model).exists(), \
            f"Could not find the specified model in ViSQOL install: {self.visqol_model}"

    def _get_target_sr(self, mode: str) -> int:
        # returns target sampling rate for the corresponding ViSQOL mode.
        if mode not in ViSQOL.SAMPLE_RATES_MODES:
            raise ValueError(
                f"Unsupported mode! Allowed are: {', '.join(ViSQOL.SAMPLE_RATES_MODES.keys())}"
            )
        return ViSQOL.SAMPLE_RATES_MODES[mode]

    def _prepare_files(
        self, ref_sig: torch.Tensor, deg_sig: torch.Tensor, sr: int, target_sr: int, pad_with_silence: bool = False
    ):
        # prepare files for ViSQOL evaluation.
        assert target_sr in ViSQOL.ALLOWED_SAMPLE_RATES
        assert len(ref_sig) == len(deg_sig), (
            "Expects same number of ref and degraded inputs"
            f" but ref len {len(ref_sig)} != deg len {len(deg_sig)}"
        )
        # resample audio if needed
        if sr != target_sr:
            transform = torchaudio.transforms.Resample(sr, target_sr)
            pad = int(0.5 * target_sr)
            rs_ref = []
            rs_deg = []
            for i in range(len(ref_sig)):
                rs_ref_i = transform(ref_sig[i])
                rs_deg_i = transform(deg_sig[i])
                if pad_with_silence:
                    rs_ref_i = torch.nn.functional.pad(rs_ref_i, (pad, pad), mode='constant', value=0)
                    rs_deg_i = torch.nn.functional.pad(rs_deg_i, (pad, pad), mode='constant', value=0)
                rs_ref.append(rs_ref_i)
                rs_deg.append(rs_deg_i)
            ref_sig = torch.stack(rs_ref)
            deg_sig = torch.stack(rs_deg)
        # save audio chunks to tmp dir and create csv
        tmp_dir = Path(tempfile.mkdtemp())
        try:
            tmp_input_csv_path = tmp_dir / "input.csv"
            tmp_results_csv_path = tmp_dir / "results.csv"
            tmp_debug_json_path = tmp_dir / "debug.json"
            with open(tmp_input_csv_path, "w") as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(["reference", "degraded"])
                for i in range(len(ref_sig)):
                    tmp_ref_filename = tmp_dir / f"ref_{i}.wav"
                    tmp_deg_filename = tmp_dir / f"deg_{i}.wav"
                    torchaudio.save(
                        tmp_ref_filename,
                        torch.clamp(ref_sig[i], min=-0.99, max=0.99),
                        sample_rate=target_sr,
                        bits_per_sample=16,
                        encoding="PCM_S"
                    )
                    torchaudio.save(
                        tmp_deg_filename,
                        torch.clamp(deg_sig[i], min=-0.99, max=0.99),
                        sample_rate=target_sr,
                        bits_per_sample=16,
                        encoding="PCM_S"
                    )
                    csv_writer.writerow([str(tmp_ref_filename), str(tmp_deg_filename)])
            return tmp_dir, tmp_input_csv_path, tmp_results_csv_path, tmp_debug_json_path
        except Exception as e:
            logger.error("Exception occurred when preparing files for ViSQOL: %s", e)
            return tmp_dir, None, None, None

    def _flush_files(self, tmp_dir: tp.Union[Path, str]):
        # flush tmp files used to compute ViSQOL.
        shutil.rmtree(str(tmp_dir))

    def _collect_moslqo_score(self, results_csv_path: tp.Union[Path, str]) -> float:
        # collect results for each evaluated pair and return averaged moslqo score.
        with open(results_csv_path, "r") as csv_file:
            reader = csv.DictReader(csv_file)
            moslqo_scores = [float(row["moslqo"]) for row in reader]
            if len(moslqo_scores) > 0:
                return sum(moslqo_scores) / len(moslqo_scores)
            else:
                return 0.0

    def _collect_debug_data(self, debug_json_path: tp.Union[Path, str]) -> dict:
        # collect debug data for the visqol inference.
        with open(debug_json_path, "r") as f:
            data = json.load(f)
            return data

    @property
    def visqol_model(self):
        return f'{self.visqol_bin}/model/{self.model}'

    def _run_visqol(
        self,
        input_csv_path: tp.Union[Path, str],
        results_csv_path: tp.Union[Path, str],
        debug_csv_path: tp.Optional[tp.Union[Path, str]],
    ):
        input_csv_path = str(input_csv_path)
        results_csv_path = str(results_csv_path)
        # keep None as None: converting it to the string "None" would wrongly enable --output_debug below
        debug_csv_path = str(debug_csv_path) if debug_csv_path is not None else None
        cmd = [
            f'{self.visqol_bin}/bazel-bin/visqol',
            '--batch_input_csv', f'{input_csv_path}',
            '--results_csv', f'{results_csv_path}'
        ]
        if debug_csv_path is not None:
            cmd += ['--output_debug', f'{debug_csv_path}']
        if self.visqol_mode == "speech":
            cmd += ['--use_speech_mode']
        cmd += ['--similarity_to_quality_model', f'{self.visqol_model}']
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode:
            logger.error("Error with visqol: \n %s \n %s", result.stdout.decode(), result.stderr.decode())
            raise RuntimeError("Error while executing visqol")
        result.check_returncode()

    def __call__(
        self,
        ref_sig: torch.Tensor,
        deg_sig: torch.Tensor,
        sr: int,
        pad_with_silence: bool = False,
    ):
        """Calculate the ViSQOL metric for a pair of audio signals at a given sample rate.
        Args:
            ref_sig (torch.Tensor): Reference signals as [B, C, T].
            deg_sig (torch.Tensor): Degraded signals as [B, C, T].
            sr (int): Sample rate of the two audio signals.
            pad_with_silence (bool): Whether to pad the file with silences as recommended
                in the visqol guidelines (see: https://github.com/google/visqol#general-guidelines-for-input).
        Returns:
            float: The ViSQOL score or mean score for the batch.
        """
        logger.debug(f"Calculating visqol with mode={self.visqol_mode} on {len(ref_sig)} samples")
        tmp_dir, input_csv, results_csv, debug_json = self._prepare_files(
            ref_sig, deg_sig, sr, self.target_sr, pad_with_silence
        )
        try:
            if input_csv and results_csv:
                self._run_visqol(
                    input_csv,
                    results_csv,
                    debug_json if self.debug else None,
                )
                mosqol = self._collect_moslqo_score(results_csv)
                return mosqol
            else:
                raise RuntimeError("Something unexpected happened when running VISQOL!")
        except Exception as e:
            logger.error("Exception occurred when running ViSQOL: %s", e)
        finally:
            self._flush_files(tmp_dir)
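A hedged usage sketch for the wrapper above (not part of the commit): the install path is a placeholder and must point to a bazel-built ViSQOL checkout containing bazel-bin/visqol and the model/ directory.
```
import torch
from audiocraft.metrics.visqol import ViSQOL

visqol = ViSQOL(bin="/install/visqol", mode="audio")  # placeholder path to the ViSQOL checkout
ref = torch.randn(4, 1, 32_000)   # reference signals [B, C, T]
deg = torch.randn(4, 1, 32_000)   # degraded signals [B, C, T]
score = visqol(ref, deg, sr=32_000, pad_with_silence=True)  # resampled to 48 kHz internally in audio mode
print(score)  # mean MOS-LQO over the batch
```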