import numpy as np
import pandas as pd

import librosa

from pathlib import Path
from typing import Callable, Literal, Optional

def load_dataset(
    paths: list,
    remove_label: list = [""],
    sr: int = 22050,
    method: Literal["fix_length", "time_stretch"] = "fix_length",
    max_time: float = 4.0):
    """Load a folder-based audio dataset fully into memory and return it as a pandas DataFrame.
    - For sklearn, feed the resulting arrays directly as an in-memory dataset.
    - For pytorch, wrap the returned DataFrame in a Dataset and convert its outputs to Tensors on the fly.

    Use `to_numpy(df.y_uniform)` to extract a numpy matrix with a (n_row, ...) shape.

    Expects a dataset folder structure such as: paths = [paths1, paths2, ...]
        - paths1
            - sub1
                - blabla_GroundTruth1.wav
                - blabla_GroundTruth2.wav
            - sub2
                - ...
            ...
        - ...

    Args:
        paths (list[Path]): list of dataset directories to parse.
        remove_label (list, optional): list of labels to exclude. Defaults to [""].
        sr (int, optional): sample rate used to resample the audio files. Defaults to 22050.
        method (Literal["fix_length", "time_stretch"], optional): uniformization method to apply. Defaults to "fix_length".
        max_time (float, optional): common audio duration in seconds. Defaults to 4.0.

    Returns:
        df (pd.DataFrame): a DataFrame with the following columns:
            - absolute_path (str): file-system absolute path of the .wav file.
            - labels (list): labels describing the sound file (subdirectory names plus the "_"-separated parts of the filename).
            - ground_truth (str): ground-truth label, i.e. the last "_"-separated part of the filename.
            - y_original_signal (np.ndarray): sound signal normalized to float and resampled at `sr` by `librosa.load`.
            - y_original_duration (float): duration of y_original_signal in seconds.
            - y_uniform (np.ndarray): uniformized sound signal computed from y_original_signal with the chosen method.
        uniform_transform (Callable[[np.ndarray, int], np.ndarray]): the lambda used to uniformize the audio signals in df.
    """
    data = []
    uniform_transform = lambda y, sr: uniformize(y, sr, method, max_time)
    for path in paths:
        path = Path(path)
        for wav_file in path.rglob("*.wav"):
            wav_file_dict = dict()
            absolute_path = wav_file.absolute()
            *labels, label = absolute_path.relative_to(path.absolute()).parts
            label = label.replace(".wav", "").split("_")
            labels.extend(label)
            ground_truth = labels[-1]
            if ground_truth not in remove_label:
                # NOTE: librosa.load resamples to `sr` (22.05 kHz by default),
                # converts stereo to mono and normalizes samples to floats in [-1, 1].
                y_original, sr = librosa.load(path=absolute_path, sr=sr)
                wav_file_dict["absolute_path"] = absolute_path
                wav_file_dict["labels"] = labels
                wav_file_dict["ground_truth"] = ground_truth
                ## Save original sound signal
                wav_file_dict["y_original_signal"] = y_original
                duration = librosa.get_duration(y=y_original, sr=sr)
                wav_file_dict["y_original_duration"] = duration
                ## Save uniformized sound signal
                wav_file_dict["y_uniform"] = uniform_transform(y_original, sr)
                data.append(wav_file_dict)
    df = pd.DataFrame(data)
    return df, uniform_transform
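
# Note: the returned `uniform_transform` closes over `method` and `max_time`, so it can be
# re-applied to new recordings at inference time to match the training preprocessing, e.g.
# (illustrative sketch; "data/audio" and "new.wav" are assumed paths, not part of this module):
#
#     df, uniform_transform = load_dataset(paths=["data/audio"])
#     y_new, sr_new = librosa.load("new.wav", sr=22050)
#     y_new_uniform = uniform_transform(y_new, sr_new)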

def uniformize(
        audio: np.ndarray,
        sr: int,
        method: Literal["fix_length", "time_stretch"] = "fix_length",
        max_time: float = 4.0
        ) -> np.ndarray:
    """Bring an audio signal to a common duration of `max_time` seconds.

    - "fix_length": zero-pad or truncate to exactly ceil(max_time * sr) samples.
    - "time_stretch": speed the signal up or slow it down so it lasts `max_time` seconds.
    """
    if method == "fix_length":
        return librosa.util.fix_length(audio, size=int(np.ceil(max_time*sr)))
    elif method == "time_stretch":
        duration = librosa.get_duration(y=audio, sr=sr)
        return librosa.effects.time_stretch(audio, rate=duration/max_time)
    else:
        raise ValueError(f"Unknown uniformization method: {method!r}")
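
# Quick comparison of the two uniformization modes (illustrative sketch on a synthetic
# signal; nothing here is required by the module):
#
#     sr = 22050
#     y = np.random.randn(2 * sr)                    # 2 s of noise
#     uniformize(y, sr, "fix_length", 4.0).shape     # (88200,): zero-padded to 4 s
#     uniformize(y, sr, "time_stretch", 4.0)         # slowed down so it lasts ~4 s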
    

def to_numpy(ds: pd.Series) -> np.ndarray:
    """Transform a pd.Series (i.e. a column slice) into a numpy array of shape (n_row, prod(cell_shape)).

    Args:
        ds (pd.Series): column to convert, where each cell holds an array-like of identical shape.

    Returns:
        np.ndarray: array stacked from the ds pd.Series, with each cell flattened into one row.
    """
    numpy_df = np.stack([*ds.to_numpy()])
    n_row, *cell_shape = numpy_df.shape

    if cell_shape:
        # Flatten each cell so every row is a 1-D feature vector.
        return numpy_df.reshape(n_row, np.prod(cell_shape))
    else:
        return numpy_df.reshape(n_row)
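

if __name__ == "__main__":
    # Minimal end-to-end sketch, assuming a local "data/audio" folder laid out as described
    # in load_dataset's docstring ("data/audio" is an assumed path, not provided by this module).
    df, uniform_transform = load_dataset(
        paths=["data/audio"],
        sr=22050,
        method="fix_length",
        max_time=4.0,
    )
    print(df[["ground_truth", "y_original_duration"]].head())

    # Flatten the uniformized signals into a (n_files, n_samples) matrix, e.g. for sklearn.
    X = to_numpy(df.y_uniform)
    y = df.ground_truth.to_numpy()
    print(X.shape, y.shape)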