import soundfile import io from typing import Any, Tuple, Union, Optional import numpy as np import torch def preprocess_wav(data: Any, incoming_sample_rate) -> Tuple[np.ndarray, int]: segment, sample_rate = soundfile.read( io.BytesIO(data), dtype="float32", always_2d=True, frames=-1, start=0, format="RAW", subtype="PCM_16", samplerate=incoming_sample_rate, channels=1, ) return segment, sample_rate def convert_waveform( waveform: Union[np.ndarray, torch.Tensor], sample_rate: int, normalize_volume: bool = False, to_mono: bool = False, to_sample_rate: Optional[int] = None, ) -> Tuple[Union[np.ndarray, torch.Tensor], int]: """convert a waveform: - to a target sample rate - from multi-channel to mono channel - volume normalization Args: waveform (numpy.ndarray or torch.Tensor): 2D original waveform (channels x length) sample_rate (int): original sample rate normalize_volume (bool): perform volume normalization to_mono (bool): convert to mono channel if having multiple channels to_sample_rate (Optional[int]): target sample rate Returns: waveform (numpy.ndarray): converted 2D waveform (channels x length) sample_rate (float): target sample rate """ try: import torchaudio.sox_effects as ta_sox except ImportError: raise ImportError("Please install torchaudio: pip install torchaudio") effects = [] if normalize_volume: effects.append(["gain", "-n"]) if to_sample_rate is not None and to_sample_rate != sample_rate: effects.append(["rate", f"{to_sample_rate}"]) if to_mono and waveform.shape[0] > 1: effects.append(["channels", "1"]) if len(effects) > 0: is_np_input = isinstance(waveform, np.ndarray) _waveform = torch.from_numpy(waveform) if is_np_input else waveform converted, converted_sample_rate = ta_sox.apply_effects_tensor( _waveform, sample_rate, effects ) if is_np_input: converted = converted.numpy() return converted, converted_sample_rate return waveform, sample_rate