Spaces:

macaodha
/

batdetect2

Running

App Files Files Community

batdetect2 / bat_detect /utils /audio_utils.py

macaodha

Update bat_detect/utils/audio_utils.py

b3aa5aa over 2 years ago

raw

history blame contribute delete

6.63 kB

	import numpy as np
	from . import wavfile
	import warnings
	import torch
	import librosa


	def time_to_x_coords(time_in_file, sampling_rate, fft_win_length, fft_overlap):
	    """Convert a time offset into a (fractional) spectrogram column position.

	    This is the inverse of ``x_coords_to_time``.

	    Args:
	        time_in_file: Time offset(s), in the same unit as ``fft_win_length``
	            (seconds when ``sampling_rate`` is in Hz). Scalars and NumPy
	            arrays both work because only arithmetic is applied.
	        sampling_rate: Sampling rate of the audio.
	        fft_win_length: FFT window length, as a duration.
	        fft_overlap: Fraction of the window shared by consecutive frames.

	    Returns:
	        The column position as a float (not rounded to an integer index).
	    """
	    # np.floor mirrors the int() truncation applied to the same quantities
	    # when the spectrogram itself is computed.
	    nfft = np.floor(fft_win_length * sampling_rate)
	    noverlap = np.floor(fft_overlap * nfft)
	    hop = nfft - noverlap
	    return (time_in_file * sampling_rate - noverlap) / hop


	# NOTE this is also defined in post_process
	def x_coords_to_time(x_pos, sampling_rate, fft_win_length, fft_overlap):
	    """Convert a spectrogram column position into a time offset.

	    This is the inverse of ``time_to_x_coords``.

	    Args:
	        x_pos: Column position(s) in the spectrogram; may be fractional,
	            scalar or a NumPy array.
	        sampling_rate: Sampling rate of the audio.
	        fft_win_length: FFT window length, as a duration.
	        fft_overlap: Fraction of the window shared by consecutive frames.

	    Returns:
	        The time offset, in the same unit as ``fft_win_length``.
	    """
	    nfft = np.floor(fft_win_length * sampling_rate)
	    noverlap = np.floor(fft_overlap * nfft)
	    hop = nfft - noverlap
	    # Alternative (unused) formulation that returns the centre of the
	    # temporal window instead:
	    #   (1.0 - fft_overlap) * fft_win_length * (x_pos + 0.5)
	    return ((x_pos * hop) + noverlap) / sampling_rate


	def _log_spec_scaling(sampling_rate, fft_win_length):
	    # Scale factor applied before log1p: 2 / (fs * sum(hann_window ** 2)).
	    window = np.hanning(int(fft_win_length * sampling_rate))
	    return 2.0 * (1.0 / sampling_rate) * (1.0 / (np.abs(window) ** 2).sum())


	def generate_spectrogram(audio, sampling_rate, params, return_spec_for_viz=False, check_spec_size=True):
	    """Compute the cropped, scaled spectrogram for an audio clip.

	    Args:
	        audio: 1-D array of audio samples.
	        sampling_rate: Sampling rate of ``audio``.
	        params: Dict providing ``fft_win_length``, ``fft_overlap``,
	            ``max_freq``, ``min_freq``, ``spec_scale`` (``'log'``, ``'pcen'``
	            or ``'none'``), ``denoise_spec_avg`` and ``max_scale_spec``.
	        return_spec_for_viz: When True, also return a log-scaled float32
	            copy of the cropped spectrogram for plotting.
	        check_spec_size: Currently unused; the size assertions it used to
	            guard are disabled (kept for interface compatibility).

	    Returns:
	        Tuple ``(spec, spec_for_viz)``; ``spec_for_viz`` is None unless
	        ``return_spec_for_viz`` is True.
	    """
	    # generate spectrogram (rows are frequency bins, highest frequency first)
	    spec = gen_mag_spectrogram(audio, sampling_rate, params['fft_win_length'], params['fft_overlap'])

	    # crop to min/max freq; multiplying by the window length turns the
	    # frequency limits into bin counts
	    max_freq = round(params['max_freq'] * params['fft_win_length'])
	    min_freq = round(params['min_freq'] * params['fft_win_length'])
	    if spec.shape[0] < max_freq:
	        # not enough bins to reach max_freq: pad zero rows at the top
	        freq_pad = max_freq - spec.shape[0]
	        spec = np.vstack((np.zeros((freq_pad, spec.shape[1]), dtype=spec.dtype), spec))
	    spec_cropped = spec[-max_freq:spec.shape[0] - min_freq, :]

	    if params['spec_scale'] == 'log':
	        log_scaling = _log_spec_scaling(sampling_rate, params['fft_win_length'])
	        spec = np.log1p(log_scaling * spec_cropped)
	    elif params['spec_scale'] == 'pcen':
	        spec = pcen(spec_cropped, sampling_rate)
	    else:
	        # 'none' (or any other value): no amplitude scaling. Use the cropped
	        # array so the frequency crop is not silently dropped and the shape
	        # agrees with spec_for_viz.
	        spec = spec_cropped

	    if params['denoise_spec_avg']:
	        # subtract each frequency row's mean over time, then floor at zero
	        spec = spec - np.mean(spec, 1)[:, np.newaxis]
	        spec.clip(min=0, out=spec)

	    if params['max_scale_spec']:
	        spec = spec / (spec.max() + 10e-6)

	    # needs to be divisible by specific factor - if not it should have been padded
	    #if check_spec_size:
	    #    assert((int(spec.shape[0]*params['resize_factor']) % params['spec_divide_factor']) == 0)
	    #    assert((int(spec.shape[1]*params['resize_factor']) % params['spec_divide_factor']) == 0)

	    # for visualization purposes - use log scaled spectrogram
	    if return_spec_for_viz:
	        log_scaling = _log_spec_scaling(sampling_rate, params['fft_win_length'])
	        spec_for_viz = np.log1p(log_scaling * spec_cropped).astype(np.float32)
	    else:
	        spec_for_viz = None

	    return spec, spec_for_viz


	def load_audio_file(audio_file, time_exp_fact, target_samp_rate, scale=False, max_duration=False):
	    """Load an audio file, undo time expansion and resample it.

	    Args:
	        audio_file: Path (or file-like object) handed to ``librosa.load``.
	        time_exp_fact: Time-expansion factor of the recording; the file's
	            sampling rate is multiplied by it before resampling.
	        target_samp_rate: Sampling rate the audio is resampled to.
	        scale: When True, remove the mean and scale to a peak magnitude of
	            (almost) 1.
	        max_duration: Maximum duration to keep, in seconds at
	            ``target_samp_rate``; ``False`` disables clipping.

	    Returns:
	        Tuple ``(sampling_rate, audio_raw)`` where ``sampling_rate`` equals
	        ``target_samp_rate`` and ``audio_raw`` is a float32 array.

	    Raises:
	        Exception: If the loaded audio has more than one dimension.
	    """
	    with warnings.catch_warnings():
	        # Filter dates from the previous reader, wavfile.read(audio_file).
	        warnings.filterwarnings('ignore', category=wavfile.WavFileWarning)
	        audio_raw, sampling_rate = librosa.load(audio_file, sr=None)

	    # NOTE(review): librosa.load defaults to mono=True, so this guard looks
	    # unreachable as called here - confirm before removing.
	    if len(audio_raw.shape) > 1:
	        raise Exception('Currently does not handle stereo files')
	    sampling_rate = sampling_rate * time_exp_fact

	    # resample - need to do this after correcting for time expansion
	    sampling_rate_old = sampling_rate
	    sampling_rate = target_samp_rate
	    audio_raw = librosa.resample(audio_raw, orig_sr=sampling_rate_old, target_sr=sampling_rate, res_type='polyphase')

	    # clipping maximum duration
	    if max_duration is not False:
	        max_samples = np.minimum(int(sampling_rate * max_duration), audio_raw.shape[0])
	        audio_raw = audio_raw[:max_samples]

	    # convert to float32 and scale
	    audio_raw = audio_raw.astype(np.float32)
	    # An empty clip (e.g. max_duration=0) has no peak; skip scaling instead
	    # of letting .max() raise on a zero-size array.
	    if scale and audio_raw.size > 0:
	        audio_raw = audio_raw - audio_raw.mean()
	        audio_raw = audio_raw / (np.abs(audio_raw).max() + 10e-6)

	    return sampling_rate, audio_raw


	def pad_audio(audio_raw, fs, ms, overlap_perc, resize_factor, divide_factor, fixed_width=None):
	    """Pad (or trim) raw audio so its spectrogram has a suitable width.

	    Zeros are appended so that the spectrogram width, after resizing by
	    ``resize_factor``, is a multiple of ``divide_factor``. Also handles very
	    short clips and, when ``fixed_width`` is given (used during training so
	    every batch item has the same size), pads or trims to that many frames.

	    Args:
	        audio_raw: 1-D array of audio samples.
	        fs: Sampling rate.
	        ms: FFT window length as a duration (``int(ms * fs)`` samples).
	        overlap_perc: Fraction of the window shared by consecutive frames.
	        resize_factor: Factor by which the spectrogram is later resized.
	        divide_factor: Required divisor of the resized spectrogram width.
	        fixed_width: Optional exact number of spectrogram frames.

	    Returns:
	        The padded or trimmed audio, same dtype as ``audio_raw``.
	    """
	    nfft = int(ms * fs)
	    noverlap = int(overlap_perc * nfft)
	    step = nfft - noverlap
	    num_samples = audio_raw.shape[0]

	    min_size = int(divide_factor * (1.0 / resize_factor))
	    spec_width = (num_samples - noverlap) // step
	    spec_width_rs = spec_width * resize_factor

	    if fixed_width is not None and spec_width < fixed_width:
	        # too small: pad up to exactly fixed_width frames
	        diff = fixed_width * step + noverlap - num_samples
	        audio_raw = np.hstack((audio_raw, np.zeros(diff, dtype=audio_raw.dtype)))

	    elif fixed_width is not None and spec_width > fixed_width:
	        # too big: keep only the samples needed for fixed_width frames
	        audio_raw = audio_raw[:fixed_width * step + noverlap]

	    elif spec_width_rs < min_size or (np.floor(spec_width_rs) % divide_factor) != 0:
	        # need to be at least min_size, rounded up to a multiple of divide_factor
	        div_amt = np.ceil(spec_width_rs / float(divide_factor))
	        div_amt = np.maximum(1, div_amt)
	        target_size = int(div_amt * divide_factor * (1.0 / resize_factor))
	        diff = target_size * step + noverlap - num_samples
	        # diff is <= 0 when the clip already yields target_size frames plus
	        # less than one extra hop; nothing to pad then (np.zeros would raise
	        # on a negative length).
	        if diff > 0:
	            audio_raw = np.hstack((audio_raw, np.zeros(diff, dtype=audio_raw.dtype)))

	    return audio_raw


	def gen_mag_spectrogram(x, fs, ms, overlap_perc):
	    """Compute a magnitude spectrogram, with the window given as a duration.

	    Args:
	        x: 1-D array of audio samples.
	        fs: Sampling rate.
	        ms: FFT window length as a duration (``int(ms * fs)`` samples).
	        overlap_perc: Fraction of the window shared by consecutive frames.

	    Returns:
	        float32 array of shape (frequency bins, frames) with the DC bin
	        removed and rows flipped so the highest frequency is row 0.
	    """
	    x = x.astype(np.float32)
	    nfft = int(ms * fs)
	    noverlap = int(overlap_perc * nfft)

	    # hop between consecutive windows
	    step = nfft - noverlap

	    # compute spec: magnitude of the STFT through librosa's public API
	    # (same result as the private _spectrogram helper with power=1);
	    # center=False means no padding around the signal
	    spec = np.abs(librosa.stft(x, n_fft=nfft, hop_length=step, center=False))

	    # remove DC component and flip vertical orientation
	    spec = np.flipud(spec[1:, :])

	    return spec.astype(np.float32)


	def gen_mag_spectrogram_pt(x, fs, ms, overlap_perc):
	    """PyTorch spectrogram of the first item in a batch of audio clips.

	    Despite the name, the values returned are squared magnitudes
	    (re**2 + im**2), not magnitudes.

	    Args:
	        x: Tensor of audio samples with a leading batch dimension; only
	            batch item 0 is returned.
	        fs: Sampling rate.
	        ms: FFT window length as a duration (``int(ms * fs)`` samples).
	        overlap_perc: Fraction of the window shared by consecutive frames.

	    Returns:
	        Tensor of shape (frequency bins, frames) with the DC bin removed and
	        rows flipped so the highest frequency is row 0.
	    """
	    nfft = int(ms * fs)
	    # NOTE(review): the hop is rounded here, whereas gen_mag_spectrogram uses
	    # nfft - int(overlap_perc * nfft); the two can differ by one sample.
	    nstep = round((1.0 - overlap_perc) * nfft)

	    han_win = torch.hann_window(nfft, periodic=False).to(x.device)

	    # torch.stft needs return_complex spelled out for real inputs;
	    # view_as_real restores the trailing (real, imag) axis this code sums.
	    complex_spec = torch.stft(x, nfft, nstep, window=han_win, center=False, return_complex=True)
	    spec = torch.view_as_real(complex_spec).pow(2.0).sum(-1)

	    # remove DC component and flip vertically
	    spec = torch.flipud(spec[0, 1:, :])

	    return spec


	def pcen(spec_cropped, sampling_rate, hop_length=512):
	    """Apply per-channel energy normalisation (PCEN) to a spectrogram.

	    Args:
	        spec_cropped: Magnitude spectrogram, shape (frequency bins, frames).
	        sampling_rate: Sampling rate of the audio; a tenth of it is passed
	            to librosa as ``sr``.
	        hop_length: Hop between frames, in samples, forwarded to
	            ``librosa.pcen``. The default of 512 is librosa's own default,
	            so existing two-argument calls behave as before; pass the real
	            spectrogram step to have the smoothing reflect the actual hop.

	    Returns:
	        float32 array with the same shape as ``spec_cropped``.
	    """
	    # The spectrogram is multiplied by 2**31 before normalisation.
	    # NOTE(review): the reason for sr=sampling_rate/10 is not visible here -
	    # confirm against how the models were trained before changing it.
	    spec = librosa.pcen(spec_cropped * (2**31), sr=sampling_rate/10, hop_length=hop_length).astype(np.float32)
	    return spec