# NOTE: the lines that originally appeared here were artifacts of a web file
# viewer (status text, file size, commit hashes, and a line-number gutter),
# not source code. They have been commented out so the module can be imported.
import numpy as np
from . import wavfile
import warnings
import torch
import librosa
def time_to_x_coords(time_in_file, sampling_rate, fft_win_length, fft_overlap):
    """Convert a time offset (seconds) into a spectrogram column position."""
    win = np.floor(fft_win_length * sampling_rate)  # int() uses floor
    overlap = np.floor(fft_overlap * win)
    hop = win - overlap
    return (time_in_file * sampling_rate - overlap) / hop
# NOTE this is also defined in post_process
def x_coords_to_time(x_pos, sampling_rate, fft_win_length, fft_overlap):
    """Convert a spectrogram column position back into a time (seconds).

    Inverse of time_to_x_coords.
    NOTE this is also defined in post_process.
    """
    win = np.floor(fft_win_length * sampling_rate)
    overlap = np.floor(fft_overlap * win)
    hop = win - overlap
    return (x_pos * hop + overlap) / sampling_rate
def generate_spectrogram(audio, sampling_rate, params, return_spec_for_viz=False, check_spec_size=True):
    """Generate a (scaled) spectrogram from raw audio.

    Args:
        audio: 1D waveform array.
        sampling_rate: sample rate in Hz (after any time-expansion correction).
        params: dict with keys 'fft_win_length', 'fft_overlap', 'max_freq',
            'min_freq', 'spec_scale', 'denoise_spec_avg', 'max_scale_spec'.
        return_spec_for_viz: if True, also return a log-scaled float32 copy
            of the cropped spectrogram for visualization.
        check_spec_size: currently unused (the size asserts are commented out).

    Returns:
        (spec, spec_for_viz) — spec_for_viz is None unless return_spec_for_viz.
    """
    def _log_scale(s):
        # Normalization compensating for the Hann window energy and the
        # sample rate before log1p compression. Hoisted into a helper so the
        # 'log' branch and the viz path share one definition (previously the
        # same expression was duplicated verbatim in both places).
        log_scaling = 2.0 * (1.0 / sampling_rate) * (
            1.0 / (np.abs(np.hanning(int(params['fft_win_length'] * sampling_rate))) ** 2).sum())
        return np.log1p(log_scaling * s)

    # generate spectrogram
    spec = gen_mag_spectrogram(audio, sampling_rate, params['fft_win_length'], params['fft_overlap'])

    # crop to min/max freq — note the frequency axis is flipped upstream,
    # so row 0 is the highest frequency
    max_freq = round(params['max_freq'] * params['fft_win_length'])
    min_freq = round(params['min_freq'] * params['fft_win_length'])
    if spec.shape[0] < max_freq:
        # pad with zero rows at the top when the requested band exceeds
        # the available bins
        freq_pad = max_freq - spec.shape[0]
        spec = np.vstack((np.zeros((freq_pad, spec.shape[1]), dtype=spec.dtype), spec))
    spec_cropped = spec[-max_freq:spec.shape[0] - min_freq, :]

    if params['spec_scale'] == 'log':
        spec = _log_scale(spec_cropped)
    elif params['spec_scale'] == 'pcen':
        spec = pcen(spec_cropped, sampling_rate)
    elif params['spec_scale'] == 'none':
        # BUGFIX: this branch previously did `pass`, leaving `spec` as the
        # *uncropped* spectrogram and silently discarding the min/max freq
        # crop computed above.
        spec = spec_cropped

    if params['denoise_spec_avg']:
        # subtract the per-frequency-band mean, then clip negatives to zero
        spec = spec - np.mean(spec, 1)[:, np.newaxis]
        spec.clip(min=0, out=spec)

    if params['max_scale_spec']:
        # scale to [0, 1]; 10e-6 guards against division by zero
        spec = spec / (spec.max() + 10e-6)

    # needs to be divisible by specific factor - if not it should have been padded
    #if check_spec_size:
        #assert((int(spec.shape[0]*params['resize_factor']) % params['spec_divide_factor']) == 0)
        #assert((int(spec.shape[1]*params['resize_factor']) % params['spec_divide_factor']) == 0)

    # for visualization purposes - use log scaled spectrogram
    if return_spec_for_viz:
        spec_for_viz = _log_scale(spec_cropped).astype(np.float32)
    else:
        spec_for_viz = None
    return spec, spec_for_viz
def load_audio_file(audio_file, time_exp_fact, target_samp_rate, scale=False, max_duration=False):
    """Load a mono wav file, undo time expansion, and resample.

    Returns (sampling_rate, audio) where sampling_rate == target_samp_rate
    and audio is float32 (optionally mean-centred and peak-normalised).
    """
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=wavfile.WavFileWarning)
        #sampling_rate, audio_raw = wavfile.read(audio_file)
        audio_raw, file_rate = librosa.load(audio_file, sr=None)
    if len(audio_raw.shape) > 1:
        raise Exception('Currently does not handle stereo files')

    # undo time expansion applied when the recording was made
    expanded_rate = file_rate * time_exp_fact

    # resample - need to do this after correcting for time expansion
    sampling_rate = target_samp_rate
    audio_raw = librosa.resample(audio_raw, orig_sr=expanded_rate,
                                 target_sr=sampling_rate, res_type='polyphase')

    # clip to a maximum duration (seconds), if requested
    if max_duration is not False:
        keep = np.minimum(int(sampling_rate * max_duration), audio_raw.shape[0])
        audio_raw = audio_raw[:keep]

    # convert to float32; optionally zero-mean and peak-normalise
    audio_raw = audio_raw.astype(np.float32)
    if scale:
        audio_raw = audio_raw - audio_raw.mean()
        audio_raw = audio_raw / (np.abs(audio_raw).max() + 10e-6)

    return sampling_rate, audio_raw
def pad_audio(audio_raw, fs, ms, overlap_perc, resize_factor, divide_factor, fixed_width=None):
    """Pad (or trim) raw audio so the resulting spectrogram has a usable width.

    Zeros are appended so that the spectrogram width, after applying
    `resize_factor`, is a multiple of `divide_factor`. When `fixed_width`
    is given (training), the audio is padded/trimmed so the spectrogram is
    exactly `fixed_width` columns, keeping batch sizes uniform.
    """
    nfft = int(ms * fs)
    noverlap = int(overlap_perc * nfft)
    step = nfft - noverlap
    min_size = int(divide_factor * (1.0 / resize_factor))
    spec_width = (audio_raw.shape[0] - noverlap) // step
    spec_width_rs = spec_width * resize_factor

    if fixed_width is not None and spec_width < fixed_width:
        # too short: zero-pad up to exactly fixed_width columns
        target_len = fixed_width * step + noverlap
        padding = np.zeros(target_len - audio_raw.shape[0], dtype=audio_raw.dtype)
        audio_raw = np.hstack((audio_raw, padding))
    elif fixed_width is not None and spec_width > fixed_width:
        # too long: trim down to exactly fixed_width columns
        target_len = fixed_width * step + noverlap
        audio_raw = audio_raw[:target_len]
    elif spec_width_rs < min_size or (np.floor(spec_width_rs) % divide_factor) != 0:
        # round the (resized) width up to the next multiple of divide_factor,
        # and to at least min_size
        div_amt = max(1, np.ceil(spec_width_rs / float(divide_factor)))
        target_size = int(div_amt * divide_factor * (1.0 / resize_factor))
        padding = np.zeros(target_size * step + noverlap - audio_raw.shape[0],
                           dtype=audio_raw.dtype)
        audio_raw = np.hstack((audio_raw, padding))

    return audio_raw
def gen_mag_spectrogram(x, fs, ms, overlap_perc):
    """Compute a magnitude spectrogram with the window specified in seconds.

    Args:
        x: 1D waveform.
        fs: sample rate in Hz.
        ms: FFT window length in seconds.
        overlap_perc: fractional window overlap (e.g. 0.75).

    Returns:
        float32 spectrogram with the DC bin removed and the frequency axis
        flipped so the highest frequency is row 0.
    """
    x = x.astype(np.float32)
    nfft = int(ms * fs)
    noverlap = int(overlap_perc * nfft)
    # window data
    step = nfft - noverlap
    # compute spec — use the public STFT API rather than librosa's private
    # core.spectrum._spectrogram helper (which can break between releases);
    # |stft| is exactly the power=1 magnitude the old call produced
    spec = np.abs(librosa.stft(y=x, n_fft=nfft, hop_length=step, center=False))
    # remove DC component and flip vertical orientation
    spec = np.flipud(spec[1:, :])
    return spec.astype(np.float32)
def gen_mag_spectrogram_pt(x, fs, ms, overlap_perc):
    """Compute a power spectrogram with PyTorch (GPU-friendly).

    Args:
        x: waveform tensor with a leading batch dimension of size 1.
        fs: sample rate in Hz.
        ms: FFT window length in seconds.
        overlap_perc: fractional window overlap.

    Returns:
        2D tensor (freq, time) with the DC bin removed and the frequency
        axis flipped so the highest frequency is row 0.
    """
    nfft = int(ms * fs)
    nstep = round((1.0 - overlap_perc) * nfft)
    han_win = torch.hann_window(nfft, periodic=False).to(x.device)
    # return_complex=True is required on modern PyTorch (deprecated without it
    # since 1.8, an error since 1.11); |z|^2 == re^2 + im^2, so abs().pow(2)
    # reproduces the old real/imag .pow(2.0).sum(-1) exactly
    complex_spec = torch.stft(x, nfft, nstep, window=han_win, center=False,
                              return_complex=True)
    spec = complex_spec.abs().pow(2.0)
    # remove DC component and flip vertically
    spec = torch.flipud(spec[0, 1:, :])
    return spec
def pcen(spec_cropped, sampling_rate):
    """Per-channel energy normalisation of the cropped spectrogram."""
    # TODO should be passing hop_length too i.e. step
    # NOTE(review): 2**31 rescales values toward int32 range before PCEN, and
    # sr is divided by 10 — presumably tuning for librosa.pcen's default
    # time constant without the true hop_length; confirm against training code.
    scaled = spec_cropped * (2 ** 31)
    return librosa.pcen(scaled, sr=sampling_rate / 10).astype(np.float32)