# AUTOGENERATED! DO NOT EDIT! File to edit: app.ipynb.

# %% auto 0
__all__ = ['data', 'audios', 'metadata', 'to_consider', 'processed_metadata', 'repo_id', 'learner', 'categories', 'title',
           'description', 'mic', 'label', 'examples', 'intf', 'process_audio_exists', 'load_x', 'load_label_tfm',
           'classify_audio']
# %% app.ipynb 1
import torch
import gradio as gr
from gradio import CSVLogger
from fastai.vision.all import *
import torchaudio
import torchaudio.transforms as T
import warnings
from huggingface_hub import from_pretrained_fastai

# %% app.ipynb 2
warnings.filterwarnings("ignore")
# %% app.ipynb 3
def process_audio_exists(audio):
    "Check whether `audio` (a `Path`) appears in the filtered metadata."
    slice_name = audio.name
    # reduce the boolean mask with `.any()`; the previous `.index.any()`
    # would wrongly report False for a match sitting at index 0
    return (processed_metadata['slice_file_name'] == slice_name).any()
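# Illustrative usage (hypothetical file name; any `Path` to a .wav works):
#   process_audio_exists(Path('examples/some_clip.wav'))  # -> True or False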
# %% app.ipynb 4 | |
data = Path('examples') | |
audios = get_files(data, extensions='.wav') | |
metadata = pd.read_csv('UrbanSound8K.csv') | |
to_consider = ['siren', 'street_music', 'children_playing', 'dog_bark', 'car_horn'] | |
processed_metadata = metadata.loc[metadata['class'].isin(to_consider)] | |
processed_metadata.loc[processed_metadata['class'] == 'siren', 'classID'] = 4 | |
processed_metadata.loc[processed_metadata['class'] == 'street_music', 'classID'] = 0 | |
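# Illustrative sanity check: car_horn=1, children_playing=2 and dog_bark=3
# keep their original UrbanSound8K IDs, so after the remap:
#   sorted(processed_metadata['classID'].unique())  # -> [0, 1, 2, 3, 4]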
# %% app.ipynb 5
class load_x(Transform):
    "Load a .wav file and encode it as a fixed-size 2-channel mel spectrogram."
    def __init__(self):
        self.sr = 44100        # target sample rate
        self.max_ms = 4000     # fixed clip length in milliseconds
        self.channels = 2      # target number of channels
    def rechannel(self, waveform, sr):
        if waveform.shape[0] == self.channels:
            # no rechanneling needed
            return waveform, sr
        if self.channels == 1:
            # converting stereo to mono
            # by selecting the first channel
            new_waveform = waveform[:1, :]
        elif self.channels == 2:
            # converting mono to stereo
            # by duplicating the first channel
            new_waveform = torch.cat([waveform, waveform])
        return new_waveform, sr
    def resample(self, waveform, sr):
        if sr == self.sr:
            # no resampling needed
            return waveform, sr
        num_channels = waveform.shape[0]
        # resample first channel
        new_waveform = torchaudio.transforms.Resample(sr, self.sr)(waveform[:1, :])
        if num_channels > 1:
            # resample second channel and merge the two
            re_two = torchaudio.transforms.Resample(sr, self.sr)(waveform[1:, :])
            new_waveform = torch.cat([new_waveform, re_two])
        return new_waveform, self.sr
    def pad_trunc(self, waveform, sr):
        num_channels, num_frames = waveform.shape
        max_len = sr // 1000 * self.max_ms
        if num_frames > max_len:
            # truncate signal to given length
            waveform = waveform[:, :max_len]
        else:
            # split the required padding randomly between beginning and end
            begin_ln = random.randint(0, max_len - num_frames)
            end_ln = max_len - num_frames - begin_ln
            # pad the audio with zeros
            pad_begin = torch.zeros((num_channels, begin_ln))
            pad_end = torch.zeros((num_channels, end_ln))
            waveform = torch.cat((pad_begin, waveform, pad_end), 1)
        return waveform, sr
    def mel_specgram(self, waveform, sr):
        # 128-bin mel spectrogram, converted to decibels for a log-scaled "image"
        mel_tfm = T.MelSpectrogram(
            sample_rate=sr,
            n_fft=1024,
            win_length=None,
            hop_length=512,
            center=True,
            pad_mode="reflect",
            power=2.0,
            norm="slaney",
            onesided=True,
            n_mels=128,
            mel_scale="htk")
        spec = mel_tfm(waveform)
        waveform = torchaudio.transforms.AmplitudeToDB(top_db=80)(spec)
        return waveform, sr
    def encodes(self, x):
        # standardize every clip (44.1 kHz, 4 s, stereo), then convert to a mel spectrogram
        waveform, sr = torchaudio.load(x)
        waveform, sr = self.resample(waveform, sr)
        waveform, sr = self.pad_trunc(waveform, sr)
        waveform, sr = self.rechannel(waveform, sr)
        waveform, sr = self.mel_specgram(waveform, sr)
        return waveform
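# Illustrative sanity check (not run by the app). With hop_length=512 and
# center=True, a 4 s clip at 44.1 kHz yields floor(4*44100/512)+1 = 345 frames:
#   spec = load_x()(audios[0])
#   spec.shape  # -> torch.Size([2, 128, 345])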
class load_label_tfm(Transform):
    "Look up the class label of an audio file in the filtered metadata."
    def __init__(self, metadata=processed_metadata): self.metadata = metadata
    def encodes(self, x):
        return self.metadata.loc[self.metadata['slice_file_name'] == x.name]['class'].item()
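# Illustrative usage (hypothetical file name; must exist in UrbanSound8K.csv):
#   load_label_tfm()(Path('examples/some_clip.wav'))  # -> e.g. 'dog_bark'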
# %% app.ipynb 6
repo_id = "Jimmie/urban8k"
# download the fine-tuned fastai learner from the Hugging Face Hub
learner = from_pretrained_fastai(repo_id)
# %% app.ipynb 14
categories = tuple(learner.dls.vocab)
def classify_audio(audio):
    "Run the learner on an uploaded clip and return per-class probabilities."
    # Gradio passes the upload as a filepath string; wrap it in a Path
    audio_path = Path(audio)
    pred, idx, probs = learner.predict(audio_path)
    return dict(zip(categories, map(float, probs)))
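# Illustrative call outside Gradio (assumes a clip exists under `examples/`);
# the probabilities below are placeholders, not real model output:
#   classify_audio(audios[0])
#   # -> {'car_horn': 0.01, 'children_playing': 0.02, 'dog_bark': 0.9, ...}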
# %% app.ipynb 16
title = "Environmental Sound Classification"
description = """
This demo showcases how AI can be used to recognize environmental sounds. It focuses specifically on 5 classes: car_horn, children_playing, dog_bark, siren, and street_music.
When uploading audio, make sure it is in .wav format and no more than 4 seconds long (longer clips are truncated).
Enjoy!
"""
mic = gr.Audio(source='upload', type="filepath", label='Upload Audio File here')
label = gr.Label()
examples = list(data.ls())
intf = gr.Interface(fn=classify_audio, inputs=mic, outputs=label, examples=examples,
                    title=title, description=description, cache_examples=False)
intf.launch(inline=False)