File size: 2,456 Bytes
2da7ed3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import soundfile
import io
from typing import Any, Tuple, Union, Optional
import numpy as np
import torch

def preprocess_wav(data: Any, incoming_sample_rate) -> Tuple[np.ndarray, int]:
    """Decode headerless 16-bit PCM audio bytes into a float32 array.

    The payload is treated as RAW / PCM_16, single-channel audio, so the
    caller has to supply the sample rate (there is no header to read it
    from).

    Args:
        data: bytes-like buffer holding the raw PCM samples; it is wrapped
            in an in-memory ``io.BytesIO`` stream before decoding.
        incoming_sample_rate: sample rate of ``data``, forwarded to
            ``soundfile.read`` as ``samplerate``.
    Returns:
        samples (numpy.ndarray): decoded float32 audio. ``always_2d=True``
            is requested, so the array is 2D even for mono input
            (soundfile documents the layout as frames x channels).
        rate (int): sample rate reported by ``soundfile.read``.
    """
    raw_stream = io.BytesIO(data)
    # Headerless input: format, subtype, rate and channel count must all be
    # spelled out explicitly for soundfile.
    read_options = {
        "dtype": "float32",
        "always_2d": True,
        "frames": -1,  # read through to the end of the stream
        "start": 0,
        "format": "RAW",
        "subtype": "PCM_16",
        "samplerate": incoming_sample_rate,
        "channels": 1,
    }
    samples, rate = soundfile.read(raw_stream, **read_options)
    return samples, rate

def convert_waveform(
    waveform: Union[np.ndarray, torch.Tensor],
    sample_rate: int,
    normalize_volume: bool = False,
    to_mono: bool = False,
    to_sample_rate: Optional[int] = None,
) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
    """Convert a waveform using SoX effects applied through torchaudio.

    Supported conversions:
    - volume normalization (SoX ``gain -n``)
    - resampling to a target sample rate (SoX ``rate``)
    - down-mixing from multi-channel to mono (SoX ``channels 1``)

    When none of the requested conversions is actually needed, the input is
    returned untouched and torchaudio is not imported at all, so the
    pass-through path works without torchaudio installed.

    Args:
        waveform (numpy.ndarray or torch.Tensor): 2D original waveform
            (channels x length)
        sample_rate (int): original sample rate
        normalize_volume (bool): perform volume normalization
        to_mono (bool): convert to mono channel if having multiple channels
        to_sample_rate (Optional[int]): target sample rate; ``None`` or a
            value equal to ``sample_rate`` means no resampling
    Returns:
        waveform (numpy.ndarray or torch.Tensor): converted 2D waveform
            (channels x length), in the same container type as the input
        sample_rate (int): sample rate of the returned waveform
    Raises:
        ImportError: if a conversion is required but torchaudio is not
            installed.
    """
    # Collect only the effects that are really needed; the order
    # (gain -> rate -> channels) is the order SoX applies them in.
    effects = []
    if normalize_volume:
        effects.append(["gain", "-n"])
    if to_sample_rate is not None and to_sample_rate != sample_rate:
        effects.append(["rate", f"{to_sample_rate}"])
    if to_mono and waveform.shape[0] > 1:
        effects.append(["channels", "1"])

    # Nothing to do: hand the input back as-is without touching torchaudio.
    if not effects:
        return waveform, sample_rate

    # Import lazily so torchaudio is only required when effects are applied.
    try:
        import torchaudio.sox_effects as ta_sox
    except ImportError as err:
        raise ImportError(
            "Please install torchaudio: pip install torchaudio"
        ) from err

    # apply_effects_tensor works on tensors; convert NumPy input on the way
    # in and restore the caller's container type on the way out.
    is_np_input = isinstance(waveform, np.ndarray)
    _waveform = torch.from_numpy(waveform) if is_np_input else waveform
    converted, converted_sample_rate = ta_sox.apply_effects_tensor(
        _waveform, sample_rate, effects
    )
    if is_np_input:
        converted = converted.numpy()
    return converted, converted_sample_rate