Spaces:
Runtime error
Runtime error
# Audio preprocessing helpers: class discovery, resampling, and mel-spectrogram image conversion
import os | |
import librosa | |
import torch | |
import numpy as np | |
from torchaudio.transforms import Resample | |
# Default sampling rate handed to compute_melspec when the caller gives none
SAMPLE_RATE = 44_100
# Clip length; presumably seconds -- not referenced by the code visible here, TODO confirm
AUDIO_LEN = 2.9

# Mel-spectrogram settings consumed by compute_melspec
N_MELS = 128      # passed as n_mels (number of mel bands)
F_MIN = 20        # passed as fmin (lowest filter frequency)
F_MAX = 16_000    # passed as fmax (highest filter frequency)
N_FFT = 1_024     # passed as n_fft (FFT window size)
HOP_LEN = 512     # passed as hop_length (samples between frames)
# Helper that discovers the class names in a target directory
def find_classes(directory: str):
    """Discover class names from the sub-directories of ``directory``.

    Args:
        directory: Path whose immediate entries are scanned; every entry
            that is a directory is treated as one class.

    Returns:
        A ``(classes, class_to_idx)`` tuple: the alphabetically sorted list
        of class names, and a dict mapping each name to its position in
        that list.

    Raises:
        FileNotFoundError: If ``directory`` contains no sub-directories.
    """
    # Collect sub-directory names, then order them so indices are reproducible
    class_names = [entry.name for entry in os.scandir(directory) if entry.is_dir()]
    class_names.sort()

    # Without at least one class folder there is nothing to label
    if len(class_names) == 0:
        raise FileNotFoundError(f"Couldn't find any classes in {directory}.")

    # Numeric label for each class name, following the sorted order
    index_by_class = dict(zip(class_names, range(len(class_names))))
    return class_names, index_by_class
def resample(wav, sample_rate, new_sample_rate):
    """Down-mix a waveform to mono and downsample it if needed.

    Args:
        wav: Audio tensor. A tensor with more than one dimension is treated
            as channel-first (channels on dim 0); a 1-D tensor is treated as
            an already-mono signal.
        sample_rate: Sampling rate of ``wav``.
        new_sample_rate: Target sampling rate.

    Returns:
        The mono waveform. It is resampled only when ``sample_rate`` is
        higher than ``new_sample_rate``; audio at or below the target rate
        is returned at its original rate (no upsampling is performed).
    """
    # Only touch the channel axis when there is one. A 1-D tensor is already
    # mono, and averaging it over dim 0 would collapse every sample into a
    # single scalar.
    if wav.dim() > 1:
        if wav.shape[0] >= 2:
            # Multi-channel: average the channels
            wav = torch.mean(wav, dim=0)
        else:
            # Single channel: just drop the channel axis
            wav = wav.squeeze(0)

    if sample_rate > new_sample_rate:
        resampler = Resample(sample_rate, new_sample_rate)
        wav = resampler(wav)
    return wav
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """Turn a single-channel array into a 3-channel ``uint8`` image.

    The input is replicated three times along a new last axis, standardized,
    and then min-max rescaled to the range [0, 255].

    Args:
        X: Array-like input (e.g. a mel-spectrogram).
        eps: Small constant added to ``std`` to avoid division by zero; also
            the minimum value range below which the image is treated as flat.
        mean: Value subtracted before scaling. Defaults to the mean of the
            stacked array. May be a scalar (including 0) or an array that
            broadcasts against the stacked array.
        std: Value divided by (after adding ``eps``). Defaults to the
            standard deviation of the stacked array. Same accepted forms as
            ``mean``.

    Returns:
        ``uint8`` array with the shape of ``X`` plus a trailing axis of
        size 3. All zeros when the standardized values span a range of at
        most ``eps``.
    """
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Use explicit None checks: a truthiness test would discard
    # a legitimate 0 and raise on array-valued statistics.
    mean = X.mean() if mean is None else mean
    std = X.std() if std is None else std
    X = (X - mean) / (std + eps)

    # Rescale to [0, 255]
    _min, _max = X.min(), X.max()
    if (_max - _min) > eps:
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        # Flat (or non-finite) input: nothing to stretch, return a black image
        V = np.zeros_like(X, dtype=np.uint8)
    return V
def normalize(image, mean=None, std=None):
    """Scale an image by 1/255 and return it channel-first as ``float32``.

    Args:
        image: Array with at least three axes; axis 2 is moved to the front
            (presumably a height x width x channel image -- verify against
            the caller).
        mean: Optional value subtracted after scaling.
        std: Optional value divided by after subtracting ``mean``.

    Returns:
        ``float32`` array with axis 2 of the input moved to axis 0.
        Standardization is applied only when both ``mean`` and ``std`` are
        given; if either is missing, it is skipped.
    """
    scaled = image / 255.0

    # Both statistics are required; a lone mean or std is ignored
    standardize = not (mean is None or std is None)
    if standardize:
        scaled = (scaled - mean) / std

    channel_first = np.moveaxis(scaled, source=2, destination=0)
    return channel_first.astype(np.float32)
def compute_melspec(wav, sample_rate=SAMPLE_RATE):
    """Compute a dB-scaled mel-spectrogram of a waveform.

    Args:
        wav: Audio samples, passed to librosa as ``y``.
        sample_rate: Sampling rate of ``wav``; defaults to ``SAMPLE_RATE``.

    Returns:
        ``float32`` array produced by ``librosa.power_to_db`` applied to the
        mel-spectrogram. FFT size, hop length, band count and frequency
        limits come from the module-level constants.
    """
    # Gather the fixed spectrogram settings in one place
    mel_settings = {
        "sr": sample_rate,
        "n_fft": N_FFT,
        "hop_length": HOP_LEN,
        "n_mels": N_MELS,
        "fmin": F_MIN,
        "fmax": F_MAX,
    }
    power_spec = librosa.feature.melspectrogram(y=wav, **mel_settings)

    # Convert the power spectrogram to decibels
    db_spec = librosa.power_to_db(power_spec)
    return db_spec.astype(np.float32)
def audio_preprocess(wav, sample_rate):
    """Convert a waveform tensor into a 3-channel spectrogram image tensor.

    Pipeline: tensor -> NumPy array -> dB mel-spectrogram
    (``compute_melspec``) -> 3-channel uint8 image (``mono_to_color``) ->
    scaled, channel-first float32 array (``normalize``) -> tensor.

    Args:
        wav: Tensor holding the audio samples; converted with ``.numpy()``.
        sample_rate: Sampling rate of ``wav``, forwarded to
            ``compute_melspec``.

    Returns:
        Tensor created from the normalized image array.
    """
    samples = wav.numpy()
    rgb_image = mono_to_color(compute_melspec(samples, sample_rate))
    # No dataset statistics are supplied, so only the 1/255 scaling is applied
    scaled_image = normalize(rgb_image, mean=None, std=None)
    return torch.from_numpy(scaled_image)