import json
import shlex
import subprocess
import tempfile
from pathlib import Path
from typing import Tuple
import ffmpy
import numpy as np
import torch
def r128stats(filepath: str, quiet: bool):
"""Takes a path to an audio file, returns a dict with the loudness
stats computed by the ffmpeg ebur128 filter.
Parameters
----------
filepath : str
Path to compute loudness stats on.
    quiet : bool
        Whether to hide the FFMPEG banner during computation.
Returns
-------
dict
Dictionary containing loudness stats.
"""
ffargs = [
"ffmpeg",
"-nostats",
"-i",
filepath,
"-filter_complex",
"ebur128",
"-f",
"null",
"-",
]
if quiet:
ffargs += ["-hide_banner"]
proc = subprocess.Popen(ffargs, stderr=subprocess.PIPE, universal_newlines=True)
    # ffmpeg prints the ebur128 "Summary:" block to stderr; parse the
    # integrated loudness, loudness range, and threshold values from it.
    stats = proc.communicate()[1]
    summary_index = stats.rfind("Summary:")
summary_list = stats[summary_index:].split()
i_lufs = float(summary_list[summary_list.index("I:") + 1])
i_thresh = float(summary_list[summary_list.index("I:") + 4])
lra = float(summary_list[summary_list.index("LRA:") + 1])
lra_thresh = float(summary_list[summary_list.index("LRA:") + 4])
lra_low = float(summary_list[summary_list.index("low:") + 1])
lra_high = float(summary_list[summary_list.index("high:") + 1])
stats_dict = {
"I": i_lufs,
"I Threshold": i_thresh,
"LRA": lra,
"LRA Threshold": lra_thresh,
"LRA Low": lra_low,
"LRA High": lra_high,
}
return stats_dict
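
# Minimal usage sketch (illustrative; "speech.wav" is a hypothetical path and
# ffmpeg must be available on the PATH):
#
#     stats = r128stats("speech.wav", quiet=True)
#     stats["I"]    # integrated loudness in LUFS
#     stats["LRA"]  # loudness range in LU
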
def ffprobe_offset_and_codec(path: str) -> Tuple[float, str]:
"""Given a path to a file, returns the start time offset and codec of
the first audio stream.
"""
ff = ffmpy.FFprobe(
inputs={path: None},
global_options="-show_entries format=start_time:stream=duration,start_time,codec_type,codec_name,start_pts,time_base -of json -v quiet",
)
streams = json.loads(ff.run(stdout=subprocess.PIPE)[0])["streams"]
seconds_offset = 0.0
codec = None
    # Find the first audio stream and return its start-time offset
    # (defaulting to 0.0) and codec name.
for stream in streams:
if stream["codec_type"] == "audio":
seconds_offset = stream.get("start_time", 0.0)
codec = stream.get("codec_name")
break
return float(seconds_offset), codec
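
# Usage sketch (illustrative; "clip.mp4" is a hypothetical media file):
#
#     offset, codec = ffprobe_offset_and_codec("clip.mp4")
#     # offset: start time of the first audio stream in seconds
#     # codec: codec name string, e.g. "aac" or "mp3"
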
class FFMPEGMixin:
_loudness = None
def ffmpeg_loudness(self, quiet: bool = True):
"""Computes loudness of audio file using FFMPEG.
Parameters
----------
        quiet : bool, optional
            Whether to suppress FFMPEG output during computation,
            by default True
Returns
-------
torch.Tensor
Loudness of every item in the batch, computed via
FFMPEG.
"""
        loudness = []
        # Write each item in the batch to a temporary wav file and measure
        # its integrated loudness (I, in LUFS) with ffmpeg's ebur128 filter.
        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
for i in range(self.batch_size):
self[i].write(f.name)
loudness_stats = r128stats(f.name, quiet=quiet)
loudness.append(loudness_stats["I"])
self._loudness = torch.from_numpy(np.array(loudness)).float()
return self.loudness()
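
    # Usage sketch (hypothetical path; assumes a class that mixes this in and
    # behaves like audiotools' AudioSignal):
    #
    #     signal = AudioSignal("speech.wav")
    #     lufs = signal.ffmpeg_loudness()  # one loudness value per batch item
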
def ffmpeg_resample(self, sample_rate: int, quiet: bool = True):
"""Resamples AudioSignal using FFMPEG. More memory-efficient
than using julius.resample for long audio files.
Parameters
----------
sample_rate : int
Sample rate to resample to.
        quiet : bool, optional
            Whether to suppress FFMPEG output during computation,
            by default True
Returns
-------
AudioSignal
Resampled AudioSignal.
"""
from audiotools import AudioSignal
if sample_rate == self.sample_rate:
return self
        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
            self.write(f.name)
            # Write the resampled audio to a sibling temporary file, load it
            # back as an AudioSignal, then delete it.
            f_out = f.name.replace("wav", "rs.wav")
            command = f"ffmpeg -i {f.name} -ar {sample_rate} {f_out}"
if quiet:
command += " -hide_banner -loglevel error"
subprocess.check_call(shlex.split(command))
resampled = AudioSignal(f_out)
            Path(f_out).unlink()
return resampled
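
    # Usage sketch (hypothetical path and rate):
    #
    #     signal = AudioSignal("speech.wav")
    #     resampled = signal.ffmpeg_resample(16000)  # AudioSignal at 16 kHz
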
@classmethod
def load_from_file_with_ffmpeg(cls, audio_path: str, quiet: bool = True, **kwargs):
"""Loads AudioSignal object after decoding it to a wav file using FFMPEG.
        Useful for loading audio that isn't covered by librosa's loading mechanism. Also
        useful for loading mp3 files without introducing any offset.
Parameters
----------
audio_path : str
Path to load AudioSignal from.
        quiet : bool, optional
            Whether to suppress FFMPEG output during computation,
            by default True
Returns
-------
AudioSignal
AudioSignal loaded from file with FFMPEG.
"""
audio_path = str(audio_path)
with tempfile.TemporaryDirectory() as d:
wav_file = str(Path(d) / "extracted.wav")
padded_wav = str(Path(d) / "padded.wav")
global_options = "-y"
if quiet:
global_options += " -loglevel error"
ff = ffmpy.FFmpeg(
inputs={audio_path: None},
outputs={wav_file: None},
global_options=global_options,
)
ff.run()
# We pad the file using the start time offset in case it's an audio
# stream starting at some offset in a video container.
pad, codec = ffprobe_offset_and_codec(audio_path)
            # For mp3s, don't pad files with discrepancies less than 0.027s,
            # as they are likely due to codec latency: the encoder delay
            # introduced by mp3 is 1152 samples, which is about 0.0261 seconds
            # at 44.1 kHz, so the threshold is set slightly above that.
            # Source: https://lame.sourceforge.io/tech-FAQ.txt.
if codec == "mp3" and pad < 0.027:
pad = 0.0
ff = ffmpy.FFmpeg(
inputs={wav_file: None},
outputs={padded_wav: f"-af 'adelay={pad*1000}:all=true'"},
global_options=global_options,
)
ff.run()
signal = cls(padded_wav, **kwargs)
return signal
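
    # Usage sketch (hypothetical path; the file is first decoded to wav by
    # FFMPEG, padded by the audio stream's start-time offset, then loaded):
    #
    #     signal = AudioSignal.load_from_file_with_ffmpeg("movie.mp4")
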