File size: 1,654 Bytes
1a942eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""Package which defines utility functions for voice conversion."""

import numpy as np
from numpy.typing import NDArray

import ffmpeg


def load_audio(file: str, sr: int) -> NDArray[np.float32]:
    """
    Load an audio file into a numpy array with a target sample rate.

    A subprocess is launched to decode the given audio file while
    down-mixing to a single channel and resampling as necessary.

    Parameters
    ----------
    file : str
        Path to the audio file. Leading and trailing spaces, double
        quotes, newlines and carriage returns are removed before the
        path is handed to ffmpeg.
    sr : int
        Target sample rate.

    Returns
    -------
    NDArray[np.float32]
        Decoded audio as a one-dimensional array of 32-bit float
        samples.

    Raises
    ------
    RuntimeError
        If the audio file cannot be loaded. The original exception is
        attached as the cause.

    See Also
    --------
    https://github.com/openai/whisper/blob/main/whisper/audio.py#L26

    Notes
    -----
    Requires the ffmpeg CLI and `typed-ffmpeg` package to be installed.

    """
    try:
        # NOTE paths pasted from a shell or file manager often carry
        # surrounding spaces, double quotes or a line ending. Stripping
        # them as one character set removes any interleaving of those
        # characters (e.g. '\n "path" \r\n'), which a chain of
        # single-character strips would only partially clean.
        # The strip is kept inside the try block so that a non-string
        # argument is still reported as a RuntimeError.
        file = file.strip(' "\n\r')
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output(
                filename="-",
                f="f32le",
                acodec="pcm_f32le",
                ac=1,
                ar=sr,
            )
            .run(
                cmd=["ffmpeg", "-nostdin"],
                capture_stdout=True,
                capture_stderr=True,
            )
        )

    except Exception as e:
        # Translate every failure (missing binary, undecodable file,
        # bad argument type) into the single documented exception type.
        err_msg = f"Failed to load audio: {e}"
        raise RuntimeError(err_msg) from e

    # The captured stdout is raw little-endian float32 PCM; flatten()
    # returns a copy, so the result does not alias the bytes buffer.
    return np.frombuffer(out, np.float32).flatten()