Spaces:
Sleeping
Sleeping
unpairedelectron07
commited on
Upload 6 files
Browse files- audiocraft/metrics/chroma_cosinesim.py +72 -0
- audiocraft/metrics/clap_consistency.py +84 -0
- audiocraft/metrics/fad.py +329 -0
- audiocraft/metrics/kld.py +220 -0
- audiocraft/metrics/rvm.py +110 -0
- audiocraft/metrics/visqol.py +216 -0
audiocraft/metrics/chroma_cosinesim.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
import torch
|
8 |
+
import torchmetrics
|
9 |
+
|
10 |
+
from ..data.audio_utils import convert_audio
|
11 |
+
from ..modules.chroma import ChromaExtractor
|
12 |
+
|
13 |
+
|
14 |
+
class ChromaCosineSimilarityMetric(torchmetrics.Metric):
|
15 |
+
"""Chroma cosine similarity metric.
|
16 |
+
|
17 |
+
This metric extracts a chromagram for a reference waveform and
|
18 |
+
a generated waveform and compares each frame using the cosine similarity
|
19 |
+
function. The output is the mean cosine similarity.
|
20 |
+
|
21 |
+
Args:
|
22 |
+
sample_rate (int): Sample rate used by the chroma extractor.
|
23 |
+
n_chroma (int): Number of chroma used by the chroma extractor.
|
24 |
+
radix2_exp (int): Exponent for the chroma extractor.
|
25 |
+
argmax (bool): Whether the chroma extractor uses argmax.
|
26 |
+
eps (float): Epsilon for cosine similarity computation.
|
27 |
+
"""
|
28 |
+
def __init__(self, sample_rate: int, n_chroma: int, radix2_exp: int, argmax: bool, eps: float = 1e-8):
|
29 |
+
super().__init__()
|
30 |
+
self.chroma_sample_rate = sample_rate
|
31 |
+
self.n_chroma = n_chroma
|
32 |
+
self.eps = eps
|
33 |
+
self.chroma_extractor = ChromaExtractor(sample_rate=self.chroma_sample_rate, n_chroma=self.n_chroma,
|
34 |
+
radix2_exp=radix2_exp, argmax=argmax)
|
35 |
+
self.add_state("cosine_sum", default=torch.tensor(0.), dist_reduce_fx="sum")
|
36 |
+
self.add_state("weight", default=torch.tensor(0.), dist_reduce_fx="sum")
|
37 |
+
|
38 |
+
def update(self, preds: torch.Tensor, targets: torch.Tensor,
|
39 |
+
sizes: torch.Tensor, sample_rates: torch.Tensor) -> None:
|
40 |
+
"""Compute cosine similarity between chromagrams and accumulate scores over the dataset."""
|
41 |
+
if preds.size(0) == 0:
|
42 |
+
return
|
43 |
+
|
44 |
+
assert preds.shape == targets.shape, (
|
45 |
+
f"Preds and target shapes mismatch: preds={preds.shape}, targets={targets.shape}")
|
46 |
+
assert preds.size(0) == sizes.size(0), (
|
47 |
+
f"Number of items in preds ({preds.shape}) mismatch ",
|
48 |
+
f"with sizes ({sizes.shape})")
|
49 |
+
assert preds.size(0) == sample_rates.size(0), (
|
50 |
+
f"Number of items in preds ({preds.shape}) mismatch ",
|
51 |
+
f"with sample_rates ({sample_rates.shape})")
|
52 |
+
assert torch.all(sample_rates == sample_rates[0].item()), "All sample rates are not the same in the batch"
|
53 |
+
|
54 |
+
device = self.weight.device
|
55 |
+
preds, targets = preds.to(device), targets.to(device) # type: ignore
|
56 |
+
sample_rate = sample_rates[0].item()
|
57 |
+
preds = convert_audio(preds, from_rate=sample_rate, to_rate=self.chroma_sample_rate, to_channels=1)
|
58 |
+
targets = convert_audio(targets, from_rate=sample_rate, to_rate=self.chroma_sample_rate, to_channels=1)
|
59 |
+
gt_chroma = self.chroma_extractor(targets)
|
60 |
+
gen_chroma = self.chroma_extractor(preds)
|
61 |
+
chroma_lens = (sizes / self.chroma_extractor.winhop).ceil().int()
|
62 |
+
for i in range(len(gt_chroma)):
|
63 |
+
t = int(chroma_lens[i].item())
|
64 |
+
cosine_sim = torch.nn.functional.cosine_similarity(
|
65 |
+
gt_chroma[i, :t], gen_chroma[i, :t], dim=1, eps=self.eps)
|
66 |
+
self.cosine_sum += cosine_sim.sum(dim=0) # type: ignore
|
67 |
+
self.weight += torch.tensor(t) # type: ignore
|
68 |
+
|
69 |
+
def compute(self) -> float:
|
70 |
+
"""Computes the average cosine similarty across all generated/target chromagrams pairs."""
|
71 |
+
assert self.weight.item() > 0, "Unable to compute with total number of comparisons <= 0" # type: ignore
|
72 |
+
return (self.cosine_sum / self.weight).item() # type: ignore
|
audiocraft/metrics/clap_consistency.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
from pathlib import Path
|
8 |
+
import typing as tp
|
9 |
+
|
10 |
+
import torch
|
11 |
+
import torchmetrics
|
12 |
+
from transformers import RobertaTokenizer # type: ignore
|
13 |
+
|
14 |
+
from ..data.audio_utils import convert_audio
|
15 |
+
from ..environment import AudioCraftEnvironment
|
16 |
+
from ..utils.utils import load_clap_state_dict
|
17 |
+
|
18 |
+
try:
|
19 |
+
import laion_clap # type: ignore
|
20 |
+
except ImportError:
|
21 |
+
laion_clap = None
|
22 |
+
|
23 |
+
|
24 |
+
class TextConsistencyMetric(torchmetrics.Metric):
|
25 |
+
"""Text consistency metric measuring consistency between audio and text pairs."""
|
26 |
+
|
27 |
+
def update(self, audio: torch.Tensor, text: tp.List[str], sizes: torch.Tensor, sample_rates: torch.Tensor) -> None:
|
28 |
+
raise NotImplementedError("implement how to update the metric from the audio and text pairs.")
|
29 |
+
|
30 |
+
def compute(self):
|
31 |
+
raise NotImplementedError("implement how to compute the final metric score.")
|
32 |
+
|
33 |
+
|
34 |
+
class CLAPTextConsistencyMetric(TextConsistencyMetric):
|
35 |
+
"""Text consistency metric relying on Contrastive Language-Audio Pretraining (CLAP).
|
36 |
+
|
37 |
+
This metric is similar to the MuLan Cycle Consistency from MusicLM (https://arxiv.org/pdf/2301.11325.pdf)
|
38 |
+
or the CLAP score used in Make-An-Audio (https://arxiv.org/pdf/2301.12661v1.pdf).
|
39 |
+
|
40 |
+
As a joint audio-text embedding model, a pretrained CLAP model can be used to quantify the
|
41 |
+
similarity between audio-text pairs. We compute the CLAP embeddings from the text descriptions as
|
42 |
+
well as the generated audio based on them, and define the MCC metric as the average cosine similarity
|
43 |
+
between these embeddings.
|
44 |
+
|
45 |
+
Model implementation & pre-trained checkpoints: https://github.com/LAION-AI/CLAP
|
46 |
+
"""
|
47 |
+
def __init__(self, model_path: tp.Union[str, Path], model_arch: str = 'HTSAT-tiny', enable_fusion: bool = False):
|
48 |
+
super().__init__()
|
49 |
+
if laion_clap is None:
|
50 |
+
raise ImportError("Please install CLAP to compute text consistency: 'pip install laion_clap'")
|
51 |
+
self.add_state("cosine_sum", default=torch.tensor(0.), dist_reduce_fx="sum")
|
52 |
+
self.add_state("weight", default=torch.tensor(0.), dist_reduce_fx="sum")
|
53 |
+
self._initialize_model(model_path, model_arch, enable_fusion)
|
54 |
+
|
55 |
+
def _initialize_model(self, model_path: tp.Union[str, Path], model_arch: str, enable_fusion: bool):
|
56 |
+
model_path = AudioCraftEnvironment.resolve_reference_path(model_path)
|
57 |
+
self.tokenize = RobertaTokenizer.from_pretrained('roberta-base')
|
58 |
+
self.model = laion_clap.CLAP_Module(enable_fusion=enable_fusion, amodel=model_arch)
|
59 |
+
self.model_sample_rate = 48_000
|
60 |
+
load_clap_state_dict(self.model, model_path)
|
61 |
+
self.model.eval()
|
62 |
+
|
63 |
+
def _tokenizer(self, texts: tp.Union[str, tp.List[str]]) -> dict:
|
64 |
+
# we use the default params from CLAP module here as well
|
65 |
+
return self.tokenize(texts, padding="max_length", truncation=True, max_length=77, return_tensors="pt")
|
66 |
+
|
67 |
+
def update(self, audio: torch.Tensor, text: tp.List[str], sizes: torch.Tensor, sample_rates: torch.Tensor) -> None:
|
68 |
+
"""Compute cosine similarity between audio and text pairs and accumulate scores over the dataset."""
|
69 |
+
assert audio.size(0) == len(text), "Number of audio and text samples should match"
|
70 |
+
assert torch.all(sample_rates == sample_rates[0].item()), "All items in batch should have the same sample rate"
|
71 |
+
sample_rate = int(sample_rates[0].item())
|
72 |
+
# convert audio batch to 48kHz monophonic audio with no channel dimension: [B, C, T] -> [B, T]
|
73 |
+
audio = convert_audio(audio, from_rate=sample_rate, to_rate=self.model_sample_rate, to_channels=1).mean(dim=1)
|
74 |
+
audio_embeddings = self.model.get_audio_embedding_from_data(audio, use_tensor=True)
|
75 |
+
text_embeddings = self.model.get_text_embedding(text, tokenizer=self._tokenizer, use_tensor=True)
|
76 |
+
# cosine similarity between the text and the audio embedding
|
77 |
+
cosine_sim = torch.nn.functional.cosine_similarity(audio_embeddings, text_embeddings, dim=1, eps=1e-8)
|
78 |
+
self.cosine_sum += cosine_sim.sum(dim=0)
|
79 |
+
self.weight += torch.tensor(cosine_sim.size(0))
|
80 |
+
|
81 |
+
def compute(self):
|
82 |
+
"""Computes the average cosine similarty across all audio/text pairs."""
|
83 |
+
assert self.weight.item() > 0, "Unable to compute with total number of comparisons <= 0" # type: ignore
|
84 |
+
return (self.cosine_sum / self.weight).item() # type: ignore
|
audiocraft/metrics/fad.py
ADDED
@@ -0,0 +1,329 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
import logging
|
8 |
+
from pathlib import Path
|
9 |
+
import os
|
10 |
+
import subprocess
|
11 |
+
import tempfile
|
12 |
+
import typing as tp
|
13 |
+
|
14 |
+
from audiocraft.data.audio import audio_write
|
15 |
+
from audiocraft.data.audio_utils import convert_audio
|
16 |
+
import flashy
|
17 |
+
import torch
|
18 |
+
import torchmetrics
|
19 |
+
|
20 |
+
from ..environment import AudioCraftEnvironment
|
21 |
+
|
22 |
+
|
23 |
+
logger = logging.getLogger(__name__)
|
24 |
+
|
25 |
+
VGGISH_SAMPLE_RATE = 16_000
|
26 |
+
VGGISH_CHANNELS = 1
|
27 |
+
|
28 |
+
|
29 |
+
class FrechetAudioDistanceMetric(torchmetrics.Metric):
|
30 |
+
"""Fréchet Audio Distance computation based on official TensorFlow implementation from Google Research.
|
31 |
+
|
32 |
+
From: D.C. Dowson & B.V. Landau The Fréchet distance between
|
33 |
+
multivariate normal distributions
|
34 |
+
https://doi.org/10.1016/0047-259X(82)90077-X
|
35 |
+
The Fréchet distance between two multivariate gaussians,
|
36 |
+
`X ~ N(mu_x, sigma_x)` and `Y ~ N(mu_y, sigma_y)`, is `d^2`.
|
37 |
+
d^2 = (mu_x - mu_y)^2 + Tr(sigma_x + sigma_y - 2 * sqrt(sigma_x*sigma_y))
|
38 |
+
= (mu_x - mu_y)^2 + Tr(sigma_x) + Tr(sigma_y)
|
39 |
+
- 2 * Tr(sqrt(sigma_x*sigma_y)))
|
40 |
+
|
41 |
+
To use this FAD computation metric, you need to have the proper Frechet Audio Distance tool setup
|
42 |
+
from: https://github.com/google-research/google-research/tree/master/frechet_audio_distance
|
43 |
+
We provide the below instructions as reference but we do not guarantee for further support
|
44 |
+
in frechet_audio_distance installation. This was tested with python 3.10, cuda 11.8, tensorflow 2.12.0.
|
45 |
+
|
46 |
+
We recommend installing the frechet_audio_distance library in a dedicated env (e.g. conda).
|
47 |
+
|
48 |
+
1. Get the code and models following the repository instructions. We used the steps below:
|
49 |
+
git clone git@github.com:google-research/google-research.git
|
50 |
+
git clone git@github.com:tensorflow/models.git
|
51 |
+
mkdir google-research/tensorflow_models
|
52 |
+
touch google-research/tensorflow_models/__init__.py
|
53 |
+
cp -r models/research/audioset google-research/tensorflow_models/
|
54 |
+
touch google-research/tensorflow_models/audioset/__init__.py
|
55 |
+
echo "from .vggish import mel_features, vggish_params, vggish_slim" > \
|
56 |
+
google-research/tensorflow_models/audioset/__init__.py
|
57 |
+
# we can now remove the tensorflow models repository
|
58 |
+
# rm -r models
|
59 |
+
cd google-research
|
60 |
+
Follow the instructions to download the vggish checkpoint. AudioCraft base configuration
|
61 |
+
assumes it is placed in the AudioCraft reference dir.
|
62 |
+
|
63 |
+
Note that we operate the following changes for the code to work with TensorFlow 2.X and python 3:
|
64 |
+
- Update xrange for range in:
|
65 |
+
https://github.com/google-research/google-research/blob/master/frechet_audio_distance/audioset_model.py
|
66 |
+
- Update `tf_record = tf.python_io.tf_record_iterator(filename).next()` to
|
67 |
+
`tf_record = tf.python_io.tf_record_iterator(filename).__next__()` in
|
68 |
+
https://github.com/google-research/google-research/blob/master/frechet_audio_distance/fad_utils.py
|
69 |
+
- Update `import vggish_params as params` to `from . import vggish_params as params` in:
|
70 |
+
https://github.com/tensorflow/models/blob/master/research/audioset/vggish/vggish_slim.py
|
71 |
+
- Add flag to provide a given batch size for running the AudioSet model in:
|
72 |
+
https://github.com/google-research/google-research/blob/master/frechet_audio_distance/create_embeddings_main.py
|
73 |
+
```
|
74 |
+
flags.DEFINE_integer('batch_size', 64,
|
75 |
+
'Number of samples in the batch for AudioSet model.')
|
76 |
+
```
|
77 |
+
Ensure you pass the flag to the create_embeddings_beam.create_pipeline function, adding:
|
78 |
+
`batch_size=FLAGS.batch_size` to the provided parameters.
|
79 |
+
|
80 |
+
2. Follow instructions for the library installation and a valid TensorFlow installation
|
81 |
+
```
|
82 |
+
# e.g. instructions from: https://www.tensorflow.org/install/pip
|
83 |
+
conda install -c conda-forge cudatoolkit=11.8.0
|
84 |
+
python3 -m pip install nvidia-cudnn-cu11==8.6.0.163 tensorflow==2.12.*
|
85 |
+
mkdir -p $CONDA_PREFIX/etc/conda/activate.d
|
86 |
+
echo 'CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)"))' \
|
87 |
+
>> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
|
88 |
+
echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/:$CUDNN_PATH/lib' \
|
89 |
+
>> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
|
90 |
+
source $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
|
91 |
+
# Verify install: on a machine with GPU device
|
92 |
+
python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"
|
93 |
+
```
|
94 |
+
|
95 |
+
Now install frechet_audio_distance required dependencies:
|
96 |
+
```
|
97 |
+
# We assume we already have TensorFlow installed from the above steps
|
98 |
+
pip install apache-beam numpy scipy tf_slim
|
99 |
+
```
|
100 |
+
|
101 |
+
Finally, follow remaining library instructions to ensure you have a working frechet_audio_distance setup
|
102 |
+
(you may want to specify --model_ckpt flag pointing to the model's path).
|
103 |
+
|
104 |
+
3. AudioCraft's FrechetAudioDistanceMetric requires 2 environment variables pointing to the python executable
|
105 |
+
and Tensorflow library path from the above installation steps:
|
106 |
+
export TF_PYTHON_EXE="<PATH_TO_THE_ENV_PYTHON_BINARY>"
|
107 |
+
export TF_LIBRARY_PATH="<PATH_TO_THE_ENV_CUDNN_LIBRARY>"
|
108 |
+
|
109 |
+
e.g. assuming we have installed everything in a dedicated conda env
|
110 |
+
with python 3.10 that is currently active:
|
111 |
+
export TF_PYTHON_EXE="$CONDA_PREFIX/bin/python"
|
112 |
+
export TF_LIBRARY_PATH="$CONDA_PREFIX/lib/python3.10/site-packages/nvidia/cudnn/lib"
|
113 |
+
|
114 |
+
Finally you may want to export the following variable:
|
115 |
+
export TF_FORCE_GPU_ALLOW_GROWTH=true
|
116 |
+
See: https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth
|
117 |
+
|
118 |
+
You can save those environment variables in your training conda env, when currently active:
|
119 |
+
`$CONDA_PREFIX/etc/conda/activate.d/env_vars.sh`
|
120 |
+
e.g. assuming the env with TensorFlow and frechet_audio_distance install is named ac_eval,
|
121 |
+
and the training conda env is named audiocraft:
|
122 |
+
```
|
123 |
+
# activate training env
|
124 |
+
conda activate audiocraft
|
125 |
+
# get path to all envs
|
126 |
+
CONDA_ENV_DIR=$(dirname $CONDA_PREFIX)
|
127 |
+
# export pointers to evaluation env for using TensorFlow in FrechetAudioDistanceMetric
|
128 |
+
touch $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
|
129 |
+
echo 'export TF_PYTHON_EXE="$CONDA_ENV_DIR/ac_eval/bin/python"' >> \
|
130 |
+
$CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
|
131 |
+
echo 'export TF_LIBRARY_PATH="$CONDA_ENV_DIR/ac_eval/lib/python3.10/site-packages/nvidia/cudnn/lib"' >> \
|
132 |
+
$CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
|
133 |
+
# optionally:
|
134 |
+
echo 'export TF_FORCE_GPU_ALLOW_GROWTH=true' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
|
135 |
+
# you may need to reactivate the audiocraft env for this to take effect
|
136 |
+
```
|
137 |
+
|
138 |
+
Args:
|
139 |
+
bin (Path or str): Path to installed frechet audio distance code.
|
140 |
+
model_path (Path or str): Path to Tensorflow checkpoint for the model
|
141 |
+
used to compute statistics over the embedding beams.
|
142 |
+
format (str): Audio format used to save files.
|
143 |
+
log_folder (Path or str, optional): Path where to write process logs.
|
144 |
+
"""
|
145 |
+
def __init__(self, bin: tp.Union[Path, str], model_path: tp.Union[Path, str],
|
146 |
+
format: str = "wav", batch_size: tp.Optional[int] = None,
|
147 |
+
log_folder: tp.Optional[tp.Union[Path, str]] = None):
|
148 |
+
super().__init__()
|
149 |
+
self.model_sample_rate = VGGISH_SAMPLE_RATE
|
150 |
+
self.model_channels = VGGISH_CHANNELS
|
151 |
+
self.model_path = AudioCraftEnvironment.resolve_reference_path(model_path)
|
152 |
+
assert Path(self.model_path).exists(), f"Could not find provided model checkpoint path at: {self.model_path}"
|
153 |
+
self.format = format
|
154 |
+
self.batch_size = batch_size
|
155 |
+
self.bin = bin
|
156 |
+
self.tf_env = {"PYTHONPATH": str(self.bin)}
|
157 |
+
self.python_path = os.environ.get('TF_PYTHON_EXE') or 'python'
|
158 |
+
logger.info("Python exe for TF is %s", self.python_path)
|
159 |
+
if 'TF_LIBRARY_PATH' in os.environ:
|
160 |
+
self.tf_env['LD_LIBRARY_PATH'] = os.environ['TF_LIBRARY_PATH']
|
161 |
+
if 'TF_FORCE_GPU_ALLOW_GROWTH' in os.environ:
|
162 |
+
self.tf_env['TF_FORCE_GPU_ALLOW_GROWTH'] = os.environ['TF_FORCE_GPU_ALLOW_GROWTH']
|
163 |
+
logger.info("Env for TF is %r", self.tf_env)
|
164 |
+
self.reset(log_folder)
|
165 |
+
self.add_state("total_files", default=torch.tensor(0.), dist_reduce_fx="sum")
|
166 |
+
|
167 |
+
def reset(self, log_folder: tp.Optional[tp.Union[Path, str]] = None):
|
168 |
+
"""Reset torchmetrics.Metrics state."""
|
169 |
+
log_folder = Path(log_folder or tempfile.mkdtemp())
|
170 |
+
self.tmp_dir = log_folder / 'fad'
|
171 |
+
self.tmp_dir.mkdir(exist_ok=True)
|
172 |
+
self.samples_tests_dir = self.tmp_dir / 'tests'
|
173 |
+
self.samples_tests_dir.mkdir(exist_ok=True)
|
174 |
+
self.samples_background_dir = self.tmp_dir / 'background'
|
175 |
+
self.samples_background_dir.mkdir(exist_ok=True)
|
176 |
+
self.manifest_tests = self.tmp_dir / 'files_tests.cvs'
|
177 |
+
self.manifest_background = self.tmp_dir / 'files_background.cvs'
|
178 |
+
self.stats_tests_dir = self.tmp_dir / 'stats_tests'
|
179 |
+
self.stats_background_dir = self.tmp_dir / 'stats_background'
|
180 |
+
self.counter = 0
|
181 |
+
|
182 |
+
def update(self, preds: torch.Tensor, targets: torch.Tensor,
|
183 |
+
sizes: torch.Tensor, sample_rates: torch.Tensor,
|
184 |
+
stems: tp.Optional[tp.List[str]] = None):
|
185 |
+
"""Update torchmetrics.Metrics by saving the audio and updating the manifest file."""
|
186 |
+
assert preds.shape == targets.shape, f"preds={preds.shape} != targets={targets.shape}"
|
187 |
+
num_samples = preds.shape[0]
|
188 |
+
assert num_samples == sizes.size(0) and num_samples == sample_rates.size(0)
|
189 |
+
assert stems is None or num_samples == len(set(stems))
|
190 |
+
for i in range(num_samples):
|
191 |
+
self.total_files += 1 # type: ignore
|
192 |
+
self.counter += 1
|
193 |
+
wav_len = int(sizes[i].item())
|
194 |
+
sample_rate = int(sample_rates[i].item())
|
195 |
+
pred_wav = preds[i]
|
196 |
+
target_wav = targets[i]
|
197 |
+
pred_wav = pred_wav[..., :wav_len]
|
198 |
+
target_wav = target_wav[..., :wav_len]
|
199 |
+
stem_name = stems[i] if stems is not None else f'sample_{self.counter}_{flashy.distrib.rank()}'
|
200 |
+
# dump audio files
|
201 |
+
try:
|
202 |
+
pred_wav = convert_audio(
|
203 |
+
pred_wav.unsqueeze(0), from_rate=sample_rate,
|
204 |
+
to_rate=self.model_sample_rate, to_channels=1).squeeze(0)
|
205 |
+
audio_write(
|
206 |
+
self.samples_tests_dir / stem_name, pred_wav, sample_rate=self.model_sample_rate,
|
207 |
+
format=self.format, strategy="peak")
|
208 |
+
except Exception as e:
|
209 |
+
logger.error(f"Exception occured when saving tests files for FAD computation: {repr(e)} - {e}")
|
210 |
+
try:
|
211 |
+
# for the ground truth audio, we enforce the 'peak' strategy to avoid modifying
|
212 |
+
# the original audio when writing it
|
213 |
+
target_wav = convert_audio(
|
214 |
+
target_wav.unsqueeze(0), from_rate=sample_rate,
|
215 |
+
to_rate=self.model_sample_rate, to_channels=1).squeeze(0)
|
216 |
+
audio_write(
|
217 |
+
self.samples_background_dir / stem_name, target_wav, sample_rate=self.model_sample_rate,
|
218 |
+
format=self.format, strategy="peak")
|
219 |
+
except Exception as e:
|
220 |
+
logger.error(f"Exception occured when saving background files for FAD computation: {repr(e)} - {e}")
|
221 |
+
|
222 |
+
def _get_samples_name(self, is_background: bool):
|
223 |
+
return 'background' if is_background else 'tests'
|
224 |
+
|
225 |
+
def _create_embedding_beams(self, is_background: bool, gpu_index: tp.Optional[int] = None):
|
226 |
+
if is_background:
|
227 |
+
input_samples_dir = self.samples_background_dir
|
228 |
+
input_filename = self.manifest_background
|
229 |
+
stats_name = self.stats_background_dir
|
230 |
+
else:
|
231 |
+
input_samples_dir = self.samples_tests_dir
|
232 |
+
input_filename = self.manifest_tests
|
233 |
+
stats_name = self.stats_tests_dir
|
234 |
+
beams_name = self._get_samples_name(is_background)
|
235 |
+
log_file = self.tmp_dir / f'fad_logs_create_beams_{beams_name}.log'
|
236 |
+
|
237 |
+
logger.info(f"Scanning samples folder to fetch list of files: {input_samples_dir}")
|
238 |
+
with open(input_filename, "w") as fout:
|
239 |
+
for path in Path(input_samples_dir).glob(f"*.{self.format}"):
|
240 |
+
fout.write(f"{str(path)}\n")
|
241 |
+
|
242 |
+
cmd = [
|
243 |
+
self.python_path, "-m",
|
244 |
+
"frechet_audio_distance.create_embeddings_main",
|
245 |
+
"--model_ckpt", f"{self.model_path}",
|
246 |
+
"--input_files", f"{str(input_filename)}",
|
247 |
+
"--stats", f"{str(stats_name)}",
|
248 |
+
]
|
249 |
+
if self.batch_size is not None:
|
250 |
+
cmd += ["--batch_size", str(self.batch_size)]
|
251 |
+
logger.info(f"Launching frechet_audio_distance embeddings main method: {' '.join(cmd)} on {beams_name}")
|
252 |
+
env = os.environ
|
253 |
+
if gpu_index is not None:
|
254 |
+
env["CUDA_VISIBLE_DEVICES"] = str(gpu_index)
|
255 |
+
process = subprocess.Popen(
|
256 |
+
cmd, stdout=open(log_file, "w"), env={**env, **self.tf_env}, stderr=subprocess.STDOUT)
|
257 |
+
return process, log_file
|
258 |
+
|
259 |
+
def _compute_fad_score(self, gpu_index: tp.Optional[int] = None):
|
260 |
+
cmd = [
|
261 |
+
self.python_path, "-m", "frechet_audio_distance.compute_fad",
|
262 |
+
"--test_stats", f"{str(self.stats_tests_dir)}",
|
263 |
+
"--background_stats", f"{str(self.stats_background_dir)}",
|
264 |
+
]
|
265 |
+
logger.info(f"Launching frechet_audio_distance compute fad method: {' '.join(cmd)}")
|
266 |
+
env = os.environ
|
267 |
+
if gpu_index is not None:
|
268 |
+
env["CUDA_VISIBLE_DEVICES"] = str(gpu_index)
|
269 |
+
result = subprocess.run(cmd, env={**env, **self.tf_env}, capture_output=True)
|
270 |
+
if result.returncode:
|
271 |
+
logger.error(
|
272 |
+
"Error with FAD computation from stats: \n %s \n %s",
|
273 |
+
result.stdout.decode(), result.stderr.decode()
|
274 |
+
)
|
275 |
+
raise RuntimeError("Error while executing FAD computation from stats")
|
276 |
+
try:
|
277 |
+
# result is "FAD: (d+).(d+)" hence we remove the prefix with (d+) being one digit or more
|
278 |
+
fad_score = float(result.stdout[4:])
|
279 |
+
return fad_score
|
280 |
+
except Exception as e:
|
281 |
+
raise RuntimeError(f"Error parsing FAD score from command stdout: {e}")
|
282 |
+
|
283 |
+
def _log_process_result(self, returncode: int, log_file: tp.Union[Path, str], is_background: bool) -> None:
|
284 |
+
beams_name = self._get_samples_name(is_background)
|
285 |
+
if returncode:
|
286 |
+
with open(log_file, "r") as f:
|
287 |
+
error_log = f.read()
|
288 |
+
logger.error(error_log)
|
289 |
+
os._exit(1)
|
290 |
+
else:
|
291 |
+
logger.info(f"Successfully computed embedding beams on {beams_name} samples.")
|
292 |
+
|
293 |
+
def _parallel_create_embedding_beams(self, num_of_gpus: int):
|
294 |
+
assert num_of_gpus > 0
|
295 |
+
logger.info("Creating embeddings beams in a parallel manner on different GPUs")
|
296 |
+
tests_beams_process, tests_beams_log_file = self._create_embedding_beams(is_background=False, gpu_index=0)
|
297 |
+
bg_beams_process, bg_beams_log_file = self._create_embedding_beams(is_background=True, gpu_index=1)
|
298 |
+
tests_beams_code = tests_beams_process.wait()
|
299 |
+
bg_beams_code = bg_beams_process.wait()
|
300 |
+
self._log_process_result(tests_beams_code, tests_beams_log_file, is_background=False)
|
301 |
+
self._log_process_result(bg_beams_code, bg_beams_log_file, is_background=True)
|
302 |
+
|
303 |
+
def _sequential_create_embedding_beams(self):
|
304 |
+
logger.info("Creating embeddings beams in a sequential manner")
|
305 |
+
tests_beams_process, tests_beams_log_file = self._create_embedding_beams(is_background=False)
|
306 |
+
tests_beams_code = tests_beams_process.wait()
|
307 |
+
self._log_process_result(tests_beams_code, tests_beams_log_file, is_background=False)
|
308 |
+
bg_beams_process, bg_beams_log_file = self._create_embedding_beams(is_background=True)
|
309 |
+
bg_beams_code = bg_beams_process.wait()
|
310 |
+
self._log_process_result(bg_beams_code, bg_beams_log_file, is_background=True)
|
311 |
+
|
312 |
+
@flashy.distrib.rank_zero_only
|
313 |
+
def _local_compute_frechet_audio_distance(self):
|
314 |
+
"""Compute Frechet Audio Distance score calling TensorFlow API."""
|
315 |
+
num_of_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
|
316 |
+
if num_of_gpus > 1:
|
317 |
+
self._parallel_create_embedding_beams(num_of_gpus)
|
318 |
+
else:
|
319 |
+
self._sequential_create_embedding_beams()
|
320 |
+
fad_score = self._compute_fad_score(gpu_index=0)
|
321 |
+
return fad_score
|
322 |
+
|
323 |
+
def compute(self) -> float:
|
324 |
+
"""Compute metrics."""
|
325 |
+
assert self.total_files.item() > 0, "No files dumped for FAD computation!" # type: ignore
|
326 |
+
fad_score = self._local_compute_frechet_audio_distance()
|
327 |
+
logger.warning(f"FAD score = {fad_score}")
|
328 |
+
fad_score = flashy.distrib.broadcast_object(fad_score, src=0)
|
329 |
+
return fad_score
|
audiocraft/metrics/kld.py
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
import contextlib
|
8 |
+
from functools import partial
|
9 |
+
import logging
|
10 |
+
import os
|
11 |
+
import typing as tp
|
12 |
+
|
13 |
+
import torch
|
14 |
+
import torchmetrics
|
15 |
+
|
16 |
+
from ..data.audio_utils import convert_audio
|
17 |
+
|
18 |
+
|
19 |
+
logger = logging.getLogger(__name__)
|
20 |
+
|
21 |
+
|
22 |
+
class _patch_passt_stft:
|
23 |
+
"""Decorator to patch torch.stft in PaSST."""
|
24 |
+
def __init__(self):
|
25 |
+
self.old_stft = torch.stft
|
26 |
+
|
27 |
+
def __enter__(self):
|
28 |
+
# return_complex is a mandatory parameter in latest torch versions
|
29 |
+
# torch is throwing RuntimeErrors when not set
|
30 |
+
torch.stft = partial(torch.stft, return_complex=False)
|
31 |
+
|
32 |
+
def __exit__(self, *exc):
|
33 |
+
torch.stft = self.old_stft
|
34 |
+
|
35 |
+
|
36 |
+
def kl_divergence(pred_probs: torch.Tensor, target_probs: torch.Tensor, epsilon: float = 1e-6) -> torch.Tensor:
|
37 |
+
"""Computes the elementwise KL-Divergence loss between probability distributions
|
38 |
+
from generated samples and target samples.
|
39 |
+
|
40 |
+
Args:
|
41 |
+
pred_probs (torch.Tensor): Probabilities for each label obtained
|
42 |
+
from a classifier on generated audio. Expected shape is [B, num_classes].
|
43 |
+
target_probs (torch.Tensor): Probabilities for each label obtained
|
44 |
+
from a classifier on target audio. Expected shape is [B, num_classes].
|
45 |
+
epsilon (float): Epsilon value.
|
46 |
+
Returns:
|
47 |
+
kld (torch.Tensor): KLD loss between each generated sample and target pair.
|
48 |
+
"""
|
49 |
+
kl_div = torch.nn.functional.kl_div((pred_probs + epsilon).log(), target_probs, reduction="none")
|
50 |
+
return kl_div.sum(-1)
|
51 |
+
|
52 |
+
|
53 |
+
class KLDivergenceMetric(torchmetrics.Metric):
|
54 |
+
"""Base implementation for KL Divergence metric.
|
55 |
+
|
56 |
+
The KL divergence is measured between probability distributions
|
57 |
+
of class predictions returned by a pre-trained audio classification model.
|
58 |
+
When the KL-divergence is low, the generated audio is expected to
|
59 |
+
have similar acoustic characteristics as the reference audio,
|
60 |
+
according to the classifier.
|
61 |
+
"""
|
62 |
+
def __init__(self):
|
63 |
+
super().__init__()
|
64 |
+
self.add_state("kld_pq_sum", default=torch.tensor(0.), dist_reduce_fx="sum")
|
65 |
+
self.add_state("kld_qp_sum", default=torch.tensor(0.), dist_reduce_fx="sum")
|
66 |
+
self.add_state("kld_all_sum", default=torch.tensor(0.), dist_reduce_fx="sum")
|
67 |
+
self.add_state("weight", default=torch.tensor(0), dist_reduce_fx="sum")
|
68 |
+
|
69 |
+
def _get_label_distribution(self, x: torch.Tensor, sizes: torch.Tensor,
|
70 |
+
sample_rates: torch.Tensor) -> tp.Optional[torch.Tensor]:
|
71 |
+
"""Get model output given provided input tensor.
|
72 |
+
|
73 |
+
Args:
|
74 |
+
x (torch.Tensor): Input audio tensor of shape [B, C, T].
|
75 |
+
sizes (torch.Tensor): Actual audio sample length, of shape [B].
|
76 |
+
sample_rates (torch.Tensor): Actual audio sample rate, of shape [B].
|
77 |
+
Returns:
|
78 |
+
probs (torch.Tensor): Probabilities over labels, of shape [B, num_classes].
|
79 |
+
"""
|
80 |
+
raise NotImplementedError("implement method to extract label distributions from the model.")
|
81 |
+
|
82 |
+
def update(self, preds: torch.Tensor, targets: torch.Tensor,
|
83 |
+
sizes: torch.Tensor, sample_rates: torch.Tensor) -> None:
|
84 |
+
"""Calculates running KL-Divergence loss between batches of audio
|
85 |
+
preds (generated) and target (ground-truth)
|
86 |
+
Args:
|
87 |
+
preds (torch.Tensor): Audio samples to evaluate, of shape [B, C, T].
|
88 |
+
targets (torch.Tensor): Target samples to compare against, of shape [B, C, T].
|
89 |
+
sizes (torch.Tensor): Actual audio sample length, of shape [B].
|
90 |
+
sample_rates (torch.Tensor): Actual audio sample rate, of shape [B].
|
91 |
+
"""
|
92 |
+
assert preds.shape == targets.shape
|
93 |
+
assert preds.size(0) > 0, "Cannot update the loss with empty tensors"
|
94 |
+
preds_probs = self._get_label_distribution(preds, sizes, sample_rates)
|
95 |
+
targets_probs = self._get_label_distribution(targets, sizes, sample_rates)
|
96 |
+
if preds_probs is not None and targets_probs is not None:
|
97 |
+
assert preds_probs.shape == targets_probs.shape
|
98 |
+
kld_scores = kl_divergence(preds_probs, targets_probs)
|
99 |
+
assert not torch.isnan(kld_scores).any(), "kld_scores contains NaN value(s)!"
|
100 |
+
self.kld_pq_sum += torch.sum(kld_scores)
|
101 |
+
kld_qp_scores = kl_divergence(targets_probs, preds_probs)
|
102 |
+
self.kld_qp_sum += torch.sum(kld_qp_scores)
|
103 |
+
self.weight += torch.tensor(kld_scores.size(0))
|
104 |
+
|
105 |
+
def compute(self) -> dict:
|
106 |
+
"""Computes KL-Divergence across all evaluated pred/target pairs."""
|
107 |
+
weight: float = float(self.weight.item()) # type: ignore
|
108 |
+
assert weight > 0, "Unable to compute with total number of comparisons <= 0"
|
109 |
+
logger.info(f"Computing KL divergence on a total of {weight} samples")
|
110 |
+
kld_pq = self.kld_pq_sum.item() / weight # type: ignore
|
111 |
+
kld_qp = self.kld_qp_sum.item() / weight # type: ignore
|
112 |
+
kld_both = kld_pq + kld_qp
|
113 |
+
return {'kld': kld_pq, 'kld_pq': kld_pq, 'kld_qp': kld_qp, 'kld_both': kld_both}
|
114 |
+
|
115 |
+
|
116 |
+
class PasstKLDivergenceMetric(KLDivergenceMetric):
|
117 |
+
"""KL-Divergence metric based on pre-trained PASST classifier on AudioSet.
|
118 |
+
|
119 |
+
From: PaSST: Efficient Training of Audio Transformers with Patchout
|
120 |
+
Paper: https://arxiv.org/abs/2110.05069
|
121 |
+
Implementation: https://github.com/kkoutini/PaSST
|
122 |
+
|
123 |
+
Follow instructions from the github repo:
|
124 |
+
```
|
125 |
+
pip install 'git+https://github.com/kkoutini/passt_hear21@0.0.19#egg=hear21passt'
|
126 |
+
```
|
127 |
+
|
128 |
+
Args:
|
129 |
+
pretrained_length (float, optional): Audio duration used for the pretrained model.
|
130 |
+
"""
|
131 |
+
def __init__(self, pretrained_length: tp.Optional[float] = None):
|
132 |
+
super().__init__()
|
133 |
+
self._initialize_model(pretrained_length)
|
134 |
+
|
135 |
+
def _initialize_model(self, pretrained_length: tp.Optional[float] = None):
|
136 |
+
"""Initialize underlying PaSST audio classifier."""
|
137 |
+
model, sr, max_frames, min_frames = self._load_base_model(pretrained_length)
|
138 |
+
self.min_input_frames = min_frames
|
139 |
+
self.max_input_frames = max_frames
|
140 |
+
self.model_sample_rate = sr
|
141 |
+
self.model = model
|
142 |
+
self.model.eval()
|
143 |
+
self.model.to(self.device)
|
144 |
+
|
145 |
+
def _load_base_model(self, pretrained_length: tp.Optional[float]):
|
146 |
+
"""Load pretrained model from PaSST."""
|
147 |
+
try:
|
148 |
+
if pretrained_length == 30:
|
149 |
+
from hear21passt.base30sec import get_basic_model # type: ignore
|
150 |
+
max_duration = 30
|
151 |
+
elif pretrained_length == 20:
|
152 |
+
from hear21passt.base20sec import get_basic_model # type: ignore
|
153 |
+
max_duration = 20
|
154 |
+
else:
|
155 |
+
from hear21passt.base import get_basic_model # type: ignore
|
156 |
+
# Original PASST was trained on AudioSet with 10s-long audio samples
|
157 |
+
max_duration = 10
|
158 |
+
min_duration = 0.15
|
159 |
+
min_duration = 0.15
|
160 |
+
except ModuleNotFoundError:
|
161 |
+
raise ModuleNotFoundError(
|
162 |
+
"Please install hear21passt to compute KL divergence: ",
|
163 |
+
"pip install 'git+https://github.com/kkoutini/passt_hear21@0.0.19#egg=hear21passt'"
|
164 |
+
)
|
165 |
+
model_sample_rate = 32_000
|
166 |
+
max_input_frames = int(max_duration * model_sample_rate)
|
167 |
+
min_input_frames = int(min_duration * model_sample_rate)
|
168 |
+
with open(os.devnull, 'w') as f, contextlib.redirect_stdout(f):
|
169 |
+
model = get_basic_model(mode='logits')
|
170 |
+
return model, model_sample_rate, max_input_frames, min_input_frames
|
171 |
+
|
172 |
+
def _process_audio(self, wav: torch.Tensor, sample_rate: int, wav_len: int) -> tp.List[torch.Tensor]:
|
173 |
+
"""Process audio to feed to the pretrained model."""
|
174 |
+
wav = wav.unsqueeze(0)
|
175 |
+
wav = wav[..., :wav_len]
|
176 |
+
wav = convert_audio(wav, from_rate=sample_rate, to_rate=self.model_sample_rate, to_channels=1)
|
177 |
+
wav = wav.squeeze(0)
|
178 |
+
# we don't pad but return a list of audio segments as this otherwise affects the KLD computation
|
179 |
+
segments = torch.split(wav, self.max_input_frames, dim=-1)
|
180 |
+
valid_segments = []
|
181 |
+
for s in segments:
|
182 |
+
# ignoring too small segments that are breaking the model inference
|
183 |
+
if s.size(-1) > self.min_input_frames:
|
184 |
+
valid_segments.append(s)
|
185 |
+
return [s[None] for s in valid_segments]
|
186 |
+
|
187 |
+
def _get_model_preds(self, wav: torch.Tensor) -> torch.Tensor:
|
188 |
+
"""Run the pretrained model and get the predictions."""
|
189 |
+
assert wav.dim() == 3, f"Unexpected number of dims for preprocessed wav: {wav.shape}"
|
190 |
+
wav = wav.mean(dim=1)
|
191 |
+
# PaSST is printing a lot of garbage that we are not interested in
|
192 |
+
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
|
193 |
+
with torch.no_grad(), _patch_passt_stft():
|
194 |
+
logits = self.model(wav.to(self.device))
|
195 |
+
probs = torch.softmax(logits, dim=-1)
|
196 |
+
return probs
|
197 |
+
|
198 |
+
def _get_label_distribution(self, x: torch.Tensor, sizes: torch.Tensor,
|
199 |
+
sample_rates: torch.Tensor) -> tp.Optional[torch.Tensor]:
|
200 |
+
"""Get model output given provided input tensor.
|
201 |
+
|
202 |
+
Args:
|
203 |
+
x (torch.Tensor): Input audio tensor of shape [B, C, T].
|
204 |
+
sizes (torch.Tensor): Actual audio sample length, of shape [B].
|
205 |
+
sample_rates (torch.Tensor): Actual audio sample rate, of shape [B].
|
206 |
+
Returns:
|
207 |
+
probs (torch.Tensor, optional): Probabilities over labels, of shape [B, num_classes].
|
208 |
+
"""
|
209 |
+
all_probs: tp.List[torch.Tensor] = []
|
210 |
+
for i, wav in enumerate(x):
|
211 |
+
sample_rate = int(sample_rates[i].item())
|
212 |
+
wav_len = int(sizes[i].item())
|
213 |
+
wav_segments = self._process_audio(wav, sample_rate, wav_len)
|
214 |
+
for segment in wav_segments:
|
215 |
+
probs = self._get_model_preds(segment).mean(dim=0)
|
216 |
+
all_probs.append(probs)
|
217 |
+
if len(all_probs) > 0:
|
218 |
+
return torch.stack(all_probs, dim=0)
|
219 |
+
else:
|
220 |
+
return None
|
audiocraft/metrics/rvm.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
import typing as tp
|
8 |
+
import torch
|
9 |
+
from torch import nn
|
10 |
+
import torchaudio
|
11 |
+
|
12 |
+
|
13 |
+
def db_to_scale(volume: tp.Union[float, torch.Tensor]):
|
14 |
+
return 10 ** (volume / 20)
|
15 |
+
|
16 |
+
|
17 |
+
def scale_to_db(scale: torch.Tensor, min_volume: float = -120):
|
18 |
+
min_scale = db_to_scale(min_volume)
|
19 |
+
return 20 * torch.log10(scale.clamp(min=min_scale))
|
20 |
+
|
21 |
+
|
22 |
+
class RelativeVolumeMel(nn.Module):
|
23 |
+
"""Relative volume melspectrogram measure.
|
24 |
+
|
25 |
+
Computes a measure of distance over two mel spectrogram that is interpretable in terms
|
26 |
+
of decibels. Given `x_ref` and `x_est` two waveforms of shape `[*, T]`, it will
|
27 |
+
first renormalize both by the ground truth of `x_ref`.
|
28 |
+
|
29 |
+
..Warning:: This class returns the volume of the distortion at the spectrogram level,
|
30 |
+
e.g. low negative values reflects lower distortion levels. For a SNR (like reported
|
31 |
+
in the MultiBandDiffusion paper), just take `-rvm`.
|
32 |
+
|
33 |
+
Then it computes the mel spectrogram `z_ref` and `z_est` and compute volume of the difference
|
34 |
+
relative to the volume of `z_ref` for each time-frequency bin. It further adds some limits, e.g.
|
35 |
+
clamping the values between -25 and 25 dB (controlled by `min_relative_volume` and `max_relative_volume`)
|
36 |
+
with the goal of avoiding the loss being dominated by parts where the reference is almost silent.
|
37 |
+
Indeed, volumes in dB can take unbounded values both towards -oo and +oo, which can make the final
|
38 |
+
average metric harder to interpret. Besides, anything below -30 dB of attenuation would sound extremely
|
39 |
+
good (for a neural network output, although sound engineers typically aim for much lower attenuations).
|
40 |
+
Similarly, anything above +30 dB would just be completely missing the target, and there is no point
|
41 |
+
in measuring by exactly how much it missed it. -25, 25 is a more conservative range, but also more
|
42 |
+
in line with what neural nets currently can achieve.
|
43 |
+
|
44 |
+
For instance, a Relative Volume Mel (RVM) score of -10 dB means that on average, the delta between
|
45 |
+
the target and reference mel-spec is 10 dB lower than the reference mel-spec value.
|
46 |
+
|
47 |
+
The metric can be aggregated over a given frequency band in order have different insights for
|
48 |
+
different region of the spectrum. `num_aggregated_bands` controls the number of bands.
|
49 |
+
|
50 |
+
..Warning:: While this function is optimized for interpretability, nothing was done to ensure it
|
51 |
+
is numerically stable when computing its gradient. We thus advise against using it as a training loss.
|
52 |
+
|
53 |
+
Args:
|
54 |
+
sample_rate (int): Sample rate of the input audio.
|
55 |
+
n_mels (int): Number of mel bands to use.
|
56 |
+
n_fft (int): Number of frequency bins for the STFT.
|
57 |
+
hop_length (int): Hop length of the STFT and the mel-spectrogram.
|
58 |
+
min_relative_volume (float): The error `z_ref - z_est` volume is given relative to
|
59 |
+
the volume of `z_ref`. If error is smaller than -25 dB of `z_ref`, then it is clamped.
|
60 |
+
max_relative_volume (float): Same as `min_relative_volume` but clamping if the error is larger than that.
|
61 |
+
max_initial_gain (float): When rescaling the audio at the very beginning, we will limit the gain
|
62 |
+
to that amount, to avoid rescaling near silence. Given in dB.
|
63 |
+
min_activity_volume (float): When computing the reference level from `z_ref`, will clamp low volume
|
64 |
+
bins to that amount. This is effectively our "zero" level for the reference mel-spectrogram,
|
65 |
+
and anything below that will be considered equally.
|
66 |
+
num_aggregated_bands (int): Number of bands to keep when computing the average RVM value.
|
67 |
+
For instance, a value of 3 would give 3 scores, roughly for low, mid and high freqs.
|
68 |
+
"""
|
69 |
+
def __init__(self, sample_rate: int = 24000, n_mels: int = 80, n_fft: int = 512,
|
70 |
+
hop_length: int = 128, min_relative_volume: float = -25,
|
71 |
+
max_relative_volume: float = 25, max_initial_gain: float = 25,
|
72 |
+
min_activity_volume: float = -25,
|
73 |
+
num_aggregated_bands: int = 4) -> None:
|
74 |
+
super().__init__()
|
75 |
+
self.melspec = torchaudio.transforms.MelSpectrogram(
|
76 |
+
n_mels=n_mels, n_fft=n_fft, hop_length=hop_length,
|
77 |
+
normalized=True, sample_rate=sample_rate, power=2)
|
78 |
+
self.min_relative_volume = min_relative_volume
|
79 |
+
self.max_relative_volume = max_relative_volume
|
80 |
+
self.max_initial_gain = max_initial_gain
|
81 |
+
self.min_activity_volume = min_activity_volume
|
82 |
+
self.num_aggregated_bands = num_aggregated_bands
|
83 |
+
|
84 |
+
def forward(self, estimate: torch.Tensor, ground_truth: torch.Tensor) -> tp.Dict[str, torch.Tensor]:
|
85 |
+
"""Compute RVM metric between estimate and reference samples.
|
86 |
+
|
87 |
+
Args:
|
88 |
+
estimate (torch.Tensor): Estimate sample.
|
89 |
+
ground_truth (torch.Tensor): Reference sample.
|
90 |
+
|
91 |
+
Returns:
|
92 |
+
dict[str, torch.Tensor]: Metrics with keys `rvm` for the overall average, and `rvm_{k}`
|
93 |
+
for the RVM over the k-th band (k=0..num_aggregated_bands - 1).
|
94 |
+
"""
|
95 |
+
min_scale = db_to_scale(-self.max_initial_gain)
|
96 |
+
std = ground_truth.pow(2).mean().sqrt().clamp(min=min_scale)
|
97 |
+
z_gt = self.melspec(ground_truth / std).sqrt()
|
98 |
+
z_est = self.melspec(estimate / std).sqrt()
|
99 |
+
|
100 |
+
delta = z_gt - z_est
|
101 |
+
ref_db = scale_to_db(z_gt, self.min_activity_volume)
|
102 |
+
delta_db = scale_to_db(delta.abs(), min_volume=-120)
|
103 |
+
relative_db = (delta_db - ref_db).clamp(self.min_relative_volume, self.max_relative_volume)
|
104 |
+
dims = list(range(relative_db.dim()))
|
105 |
+
dims.remove(dims[-2])
|
106 |
+
losses_per_band = relative_db.mean(dim=dims)
|
107 |
+
aggregated = [chunk.mean() for chunk in losses_per_band.chunk(self.num_aggregated_bands, dim=0)]
|
108 |
+
metrics = {f'rvm_{index}': value for index, value in enumerate(aggregated)}
|
109 |
+
metrics['rvm'] = losses_per_band.mean()
|
110 |
+
return metrics
|
audiocraft/metrics/visqol.py
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
import csv
|
8 |
+
import json
|
9 |
+
import logging
|
10 |
+
from pathlib import Path
|
11 |
+
import tempfile
|
12 |
+
import typing as tp
|
13 |
+
import subprocess
|
14 |
+
import shutil
|
15 |
+
|
16 |
+
import torch
|
17 |
+
import torchaudio
|
18 |
+
|
19 |
+
logger = logging.getLogger(__name__)
|
20 |
+
|
21 |
+
|
22 |
+
class ViSQOL:
|
23 |
+
"""ViSQOL wrapper to run ViSQOL from Python using a pre-installed binary.
|
24 |
+
|
25 |
+
To learn more about ViSQOL and how to build ViSQOL binary using bazel, please refer to the
|
26 |
+
instructions available in the open source repository: https://github.com/google/visqol
|
27 |
+
|
28 |
+
ViSQOL is capable of running in two modes:
|
29 |
+
|
30 |
+
Audio Mode:
|
31 |
+
When running in audio mode, input signals must have a 48kHz sample rate. Input should be resampled to 48kHz.
|
32 |
+
Input signals can be multi-channel, but they will be down-mixed to mono for performing the comparison.
|
33 |
+
Audio mode uses support vector regression, with the maximum range at ~4.75.
|
34 |
+
|
35 |
+
Speech Mode:
|
36 |
+
When running in speech mode, ViSQOL uses a wideband model. It therefore expects input sample rates of 16kHz.
|
37 |
+
Input should be resampled to 16kHz.
|
38 |
+
As part of the speech mode processing, a root mean square implementation for voice activity detection
|
39 |
+
is performed on the reference signal to determine what parts of the signal have voice activity and
|
40 |
+
should therefore be included in the comparison. The signal is normalized before performing the voice
|
41 |
+
activity detection.
|
42 |
+
Input signals can be multi-channel, but they will be down-mixed to mono for performing the comparison.
|
43 |
+
Speech mode is scaled to have a maximum MOS of 5.0 to match previous version behavior.
|
44 |
+
|
45 |
+
For more details, check the guidelines: https://github.com/google/visqol#general-guidelines-for-input
|
46 |
+
|
47 |
+
Args:
|
48 |
+
visqol_bin (str): Path to the ViSQOL binary.
|
49 |
+
mode (str): ViSQOL computation mode, expecting "audio" or "speech".
|
50 |
+
model (str): Name of the model to use for similarity to quality model.
|
51 |
+
debug (bool): Whether to also get debug metrics from ViSQOL or not.
|
52 |
+
"""
|
53 |
+
SAMPLE_RATES_MODES = {"audio": 48_000, "speech": 16_000}
|
54 |
+
ALLOWED_SAMPLE_RATES = frozenset(SAMPLE_RATES_MODES.values())
|
55 |
+
|
56 |
+
def __init__(self, bin: tp.Union[Path, str], mode: str = "audio",
|
57 |
+
model: str = "libsvm_nu_svr_model.txt", debug: bool = False):
|
58 |
+
assert bin is not None and Path(bin).exists(), f"Could not find ViSQOL binary in specified path: {bin}"
|
59 |
+
self.visqol_bin = str(bin)
|
60 |
+
self.visqol_mode = mode
|
61 |
+
self.target_sr = self._get_target_sr(self.visqol_mode)
|
62 |
+
self.model = model
|
63 |
+
self.debug = debug
|
64 |
+
assert Path(self.visqol_model).exists(), \
|
65 |
+
f"Could not find the specified model in ViSQOL install: {self.visqol_model}"
|
66 |
+
|
67 |
+
def _get_target_sr(self, mode: str) -> int:
|
68 |
+
# returns target sampling rate for the corresponding ViSQOL mode.
|
69 |
+
if mode not in ViSQOL.SAMPLE_RATES_MODES:
|
70 |
+
raise ValueError(
|
71 |
+
f"Unsupported mode! Allowed are: {', '.join(ViSQOL.SAMPLE_RATES_MODES.keys())}"
|
72 |
+
)
|
73 |
+
return ViSQOL.SAMPLE_RATES_MODES[mode]
|
74 |
+
|
75 |
+
def _prepare_files(
|
76 |
+
self, ref_sig: torch.Tensor, deg_sig: torch.Tensor, sr: int, target_sr: int, pad_with_silence: bool = False
|
77 |
+
):
|
78 |
+
# prepare files for ViSQOL evaluation.
|
79 |
+
assert target_sr in ViSQOL.ALLOWED_SAMPLE_RATES
|
80 |
+
assert len(ref_sig) == len(deg_sig), (
|
81 |
+
"Expects same number of ref and degraded inputs",
|
82 |
+
f" but ref len {len(ref_sig)} != deg len {len(deg_sig)}"
|
83 |
+
)
|
84 |
+
# resample audio if needed
|
85 |
+
if sr != target_sr:
|
86 |
+
transform = torchaudio.transforms.Resample(sr, target_sr)
|
87 |
+
pad = int(0.5 * target_sr)
|
88 |
+
rs_ref = []
|
89 |
+
rs_deg = []
|
90 |
+
for i in range(len(ref_sig)):
|
91 |
+
rs_ref_i = transform(ref_sig[i])
|
92 |
+
rs_deg_i = transform(deg_sig[i])
|
93 |
+
if pad_with_silence:
|
94 |
+
rs_ref_i = torch.nn.functional.pad(rs_ref_i, (pad, pad), mode='constant', value=0)
|
95 |
+
rs_deg_i = torch.nn.functional.pad(rs_deg_i, (pad, pad), mode='constant', value=0)
|
96 |
+
rs_ref.append(rs_ref_i)
|
97 |
+
rs_deg.append(rs_deg_i)
|
98 |
+
ref_sig = torch.stack(rs_ref)
|
99 |
+
deg_sig = torch.stack(rs_deg)
|
100 |
+
# save audio chunks to tmp dir and create csv
|
101 |
+
tmp_dir = Path(tempfile.mkdtemp())
|
102 |
+
try:
|
103 |
+
tmp_input_csv_path = tmp_dir / "input.csv"
|
104 |
+
tmp_results_csv_path = tmp_dir / "results.csv"
|
105 |
+
tmp_debug_json_path = tmp_dir / "debug.json"
|
106 |
+
with open(tmp_input_csv_path, "w") as csv_file:
|
107 |
+
csv_writer = csv.writer(csv_file)
|
108 |
+
csv_writer.writerow(["reference", "degraded"])
|
109 |
+
for i in range(len(ref_sig)):
|
110 |
+
tmp_ref_filename = tmp_dir / f"ref_{i}.wav"
|
111 |
+
tmp_deg_filename = tmp_dir / f"deg_{i}.wav"
|
112 |
+
torchaudio.save(
|
113 |
+
tmp_ref_filename,
|
114 |
+
torch.clamp(ref_sig[i], min=-0.99, max=0.99),
|
115 |
+
sample_rate=target_sr,
|
116 |
+
bits_per_sample=16,
|
117 |
+
encoding="PCM_S"
|
118 |
+
)
|
119 |
+
torchaudio.save(
|
120 |
+
tmp_deg_filename,
|
121 |
+
torch.clamp(deg_sig[i], min=-0.99, max=0.99),
|
122 |
+
sample_rate=target_sr,
|
123 |
+
bits_per_sample=16,
|
124 |
+
encoding="PCM_S"
|
125 |
+
)
|
126 |
+
csv_writer.writerow([str(tmp_ref_filename), str(tmp_deg_filename)])
|
127 |
+
return tmp_dir, tmp_input_csv_path, tmp_results_csv_path, tmp_debug_json_path
|
128 |
+
except Exception as e:
|
129 |
+
logger.error("Exception occurred when preparing files for ViSQOL: %s", e)
|
130 |
+
return tmp_dir, None, None, None
|
131 |
+
|
132 |
+
def _flush_files(self, tmp_dir: tp.Union[Path, str]):
|
133 |
+
# flush tmp files used to compute ViSQOL.
|
134 |
+
shutil.rmtree(str(tmp_dir))
|
135 |
+
|
136 |
+
def _collect_moslqo_score(self, results_csv_path: tp.Union[Path, str]) -> float:
|
137 |
+
# collect results for each evaluated pair and return averaged moslqo score.
|
138 |
+
with open(results_csv_path, "r") as csv_file:
|
139 |
+
reader = csv.DictReader(csv_file)
|
140 |
+
moslqo_scores = [float(row["moslqo"]) for row in reader]
|
141 |
+
if len(moslqo_scores) > 0:
|
142 |
+
return sum(moslqo_scores) / len(moslqo_scores)
|
143 |
+
else:
|
144 |
+
return 0.0
|
145 |
+
|
146 |
+
def _collect_debug_data(self, debug_json_path: tp.Union[Path, str]) -> dict:
|
147 |
+
# collect debug data for the visqol inference.
|
148 |
+
with open(debug_json_path, "r") as f:
|
149 |
+
data = json.load(f)
|
150 |
+
return data
|
151 |
+
|
152 |
+
@property
|
153 |
+
def visqol_model(self):
|
154 |
+
return f'{self.visqol_bin}/model/{self.model}'
|
155 |
+
|
156 |
+
def _run_visqol(
|
157 |
+
self,
|
158 |
+
input_csv_path: tp.Union[Path, str],
|
159 |
+
results_csv_path: tp.Union[Path, str],
|
160 |
+
debug_csv_path: tp.Optional[tp.Union[Path, str]],
|
161 |
+
):
|
162 |
+
input_csv_path = str(input_csv_path)
|
163 |
+
results_csv_path = str(results_csv_path)
|
164 |
+
debug_csv_path = str(debug_csv_path)
|
165 |
+
cmd = [
|
166 |
+
f'{self.visqol_bin}/bazel-bin/visqol',
|
167 |
+
'--batch_input_csv', f'{input_csv_path}',
|
168 |
+
'--results_csv', f'{results_csv_path}'
|
169 |
+
]
|
170 |
+
if debug_csv_path is not None:
|
171 |
+
cmd += ['--output_debug', f'{debug_csv_path}']
|
172 |
+
if self.visqol_mode == "speech":
|
173 |
+
cmd += ['--use_speech_mode']
|
174 |
+
cmd += ['--similarity_to_quality_model', f'{self.visqol_model}']
|
175 |
+
result = subprocess.run(cmd, capture_output=True)
|
176 |
+
if result.returncode:
|
177 |
+
logger.error("Error with visqol: \n %s \n %s", result.stdout.decode(), result.stderr.decode())
|
178 |
+
raise RuntimeError("Error while executing visqol")
|
179 |
+
result.check_returncode()
|
180 |
+
|
181 |
+
def __call__(
|
182 |
+
self,
|
183 |
+
ref_sig: torch.Tensor,
|
184 |
+
deg_sig: torch.Tensor,
|
185 |
+
sr: int,
|
186 |
+
pad_with_silence: bool = False,
|
187 |
+
):
|
188 |
+
"""Calculate the ViSQOL metric for a pair of audio signals at a given sample rate.
|
189 |
+
Args:
|
190 |
+
ref_sig (torch.Tensor): Reference signals as [B, C, T].
|
191 |
+
deg_sig (torch.Tensor): Degraded signals as [B, C, T].
|
192 |
+
sr (int): Sample rate of the two audio signals.
|
193 |
+
pad_with_silence (bool): Whether to pad the file with silences as recommended
|
194 |
+
in visqol guidelines (see: https://github.com/google/visqol#general-guidelines-for-input).
|
195 |
+
Returns:
|
196 |
+
float: The ViSQOL score or mean score for the batch.
|
197 |
+
"""
|
198 |
+
logger.debug(f"Calculating visqol with mode={self.visqol_mode} on {len(ref_sig)} samples")
|
199 |
+
tmp_dir, input_csv, results_csv, debug_json = self._prepare_files(
|
200 |
+
ref_sig, deg_sig, sr, self.target_sr, pad_with_silence
|
201 |
+
)
|
202 |
+
try:
|
203 |
+
if input_csv and results_csv:
|
204 |
+
self._run_visqol(
|
205 |
+
input_csv,
|
206 |
+
results_csv,
|
207 |
+
debug_json if self.debug else None,
|
208 |
+
)
|
209 |
+
mosqol = self._collect_moslqo_score(results_csv)
|
210 |
+
return mosqol
|
211 |
+
else:
|
212 |
+
raise RuntimeError("Something unexpected happened when running VISQOL!")
|
213 |
+
except Exception as e:
|
214 |
+
logger.error("Exception occurred when running ViSQOL: %s", e)
|
215 |
+
finally:
|
216 |
+
self._flush_files(tmp_dir)
|