# FrenchDroneKeyword / dataloading.py
import numpy as np
import pandas as pd
import librosa
from pathlib import Path
from typing import Callable, Literal, Optional


def load_dataset(
    paths: list,
    remove_label: list = [""],
    sr: int = 22050,
    method: Literal["fix_length", "time_stretch"] = "fix_length",
    max_time: float = 4.0,
):
"""Folder dataset in memory loader (return fully loaded pandas dataframe).
- For sklearn, load the whole dataset if possible otherwise use `proportion` to only load a part of the dataset.
- For pytorch, load the whole dataset if possible otherwise use `proportion` to only load a part of the dataset.
And convert output to Tensor on the fly.
Use `to_numpy(df.y)` to extract a numpy matrix with a (n_row, ...) shape.
Expect a dataset folder structure as: paths = [paths1, paths2, ...]
- paths1
- sub1
- blabla_GroundTruth1.wav
- blabla_GroundTruth2.wav
- sub2
- ...
...
- ...
Args:
paths (list[Path]): list of dataset directory to parse.
remove_label (list, optional): list of label to remove. Defaults to None.. Defaults to [""].
shuffle (bool, optional): True to suffle the dataframe. Defaults to True.
proportion (float, optional): Proportion of file to load. Defaults to 1.0.
sr (int, optional): Sample Rate to resample audio file. Defaults to 22050.
method (Literal['fix_length';, 'time_stretch'], optional): uniformization method to apply. Defaults to "fix_length".
max_time (float, optional): Common audio duration . Defaults to 4.0.
Returns:
df (pd.DataFrame): A pd.DataFrame with such define column:
- absolute_path (str): file-system absolute path of the .wav file.
- labels (list): list of labels defining the sound file (ie, subdirectories and post _ filename).
- ground_truth (str): ground_truth label meaning the last one after _ in the sound filename.
- y_original_signal (np.ndarray): sound signal normalize as `float64` and resample with the given sr by `librosa.load`
- y_original_duration (float): y_original_signal signal duration.
- y_uniform (np.ndarray): uniformized sound signal compute from y_original_signal using the chosen uniform method.
uniform_transform (Callable[[np.ndarray, int], np.ndarray]]): A lambda function to uniformized an audio signal as the same in df.
"""
    data = []
    uniform_transform = lambda y, sr: uniformize(y, sr, method, max_time)
    for path in paths:
        path = Path(path)
        for wav_file in path.rglob("*.wav"):
            wav_file_dict = dict()
            absolute_path = wav_file.absolute()
            # Labels are the subdirectories plus the "_"-separated filename parts
            *labels, label = absolute_path.relative_to(path.absolute()).parts
            label = label.replace(".wav", "").split("_")
            labels.extend(label)
            ground_truth = labels[-1]
            if ground_truth not in remove_label:
                y_original, sr = librosa.load(path=absolute_path, sr=sr)
                # WARNING: librosa.load resamples to the requested sr (22.05 kHz here),
                # normalizes the bit depth between -1 and 1 and converts stereo to mono
                wav_file_dict["absolute_path"] = absolute_path
                wav_file_dict["labels"] = labels
                wav_file_dict["ground_truth"] = ground_truth
                # Save the original sound signal and its duration
                wav_file_dict["y_original_signal"] = y_original
                duration = librosa.get_duration(y=y_original, sr=sr)
                wav_file_dict["y_original_duration"] = duration
                # Save the uniformized sound signal
                wav_file_dict["y_uniform"] = uniform_transform(y_original, sr)
                data.append(wav_file_dict)
    df = pd.DataFrame(data)
    return df, uniform_transform
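
# Usage sketch (hypothetical paths and labels, shown as a comment so the module
# stays import-safe; adapt to your own folder layout):
#
#   df, uniform_transform = load_dataset(
#       paths=["dataset/session1", "dataset/session2"],
#       remove_label=[""],
#       sr=22050,
#       method="fix_length",
#       max_time=4.0,
#   )
#   X = to_numpy(df.y_uniform)          # (n_row, n_samples) feature matrix
#   y = df.ground_truth.to_numpy()      # one label string per row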


def uniformize(
    audio: np.ndarray,
    sr: int,
    method: Literal["fix_length", "time_stretch"] = "fix_length",
    max_time: float = 4.0,
) -> np.ndarray:
    """Uniformize an audio signal to a common duration of `max_time` seconds."""
    if method == "fix_length":
        # Pad with zeros or truncate to exactly ceil(max_time * sr) samples
        return librosa.util.fix_length(audio, size=int(np.ceil(max_time * sr)))
    elif method == "time_stretch":
        # Stretch or compress the signal in time so its duration becomes max_time
        duration = librosa.get_duration(y=audio, sr=sr)
        return librosa.effects.time_stretch(audio, rate=duration / max_time)
    raise ValueError(f"Unknown uniformization method: {method}")
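
# Illustrative sketch of the two uniformization behaviours (assumed sr=22050 and
# max_time=4.0; shown as a comment so nothing runs at import time):
#
#   y = np.random.randn(2 * 22050)                    # 2 s of noise
#   uniformize(y, 22050, "fix_length").shape          # -> (88200,), zero-padded
#   y_stretched = uniformize(y, 22050, "time_stretch")
#   librosa.get_duration(y=y_stretched, sr=22050)     # -> ~4.0 s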


def to_numpy(ds: pd.Series) -> np.ndarray:
    """Transform a pd.Series (i.e. a column slice) into a numpy array of shape
    (n_row, cell_array.flatten()).

    Args:
        ds (pd.Series): column to transform into a numpy array.

    Returns:
        np.ndarray: resulting np.ndarray built from the ds pd.Series.
    """
    numpy_df = np.stack([*ds.to_numpy()])
    n_row, *rest = numpy_df.shape
    if rest:
        # Flatten each cell array into a single feature vector per row
        return numpy_df.reshape(n_row, np.prod(rest))
    else:
        return numpy_df.reshape(n_row)
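

# Minimal smoke test for `to_numpy` (illustrative only): builds a small pd.Series
# of equally shaped arrays, mimicking the `y_uniform` column, and checks the
# flattened (n_row, n_features) shape.
if __name__ == "__main__":
    demo = pd.Series([np.zeros((4, 2)), np.ones((4, 2))])
    matrix = to_numpy(demo)
    print(matrix.shape)  # expected: (2, 8)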