# AUTOGENERATED! DO NOT EDIT! File to edit: app.ipynb.

# %% auto 0
__all__ = ['data', 'audios', 'metadata', 'to_consider', 'processed_metadata', 'repo_id', 'learner', 'categories',
           'title', 'description', 'mic', 'label', 'examples', 'intf', 'process_audio_exists', 'load_x',
           'load_label_tfm', 'classify_audio']

# %% app.ipynb 1
import torch
import gradio as gr
from gradio import CSVLogger
from fastai.vision.all import *
import torchaudio
import torchaudio.transforms as T
import warnings

from huggingface_hub import from_pretrained_fastai

# %% app.ipynb 2
warnings.filterwarnings("ignore")

# %% app.ipynb 3
def process_audio_exists(audio):
    slice_name = audio.name
    # check whether the slice name exists in the filtered metadata
    row = processed_metadata.loc[processed_metadata['slice_file_name'] == slice_name].index.any()
    return row

# %% app.ipynb 4
data = Path('examples')
audios = get_files(data, extensions='.wav')
metadata = pd.read_csv('UrbanSound8K.csv')

to_consider = ['siren', 'street_music', 'children_playing', 'dog_bark', 'car_horn']
# .copy() avoids pandas' SettingWithCopyWarning when classID is reassigned below
processed_metadata = metadata.loc[metadata['class'].isin(to_consider)].copy()

# remap siren (8) and street_music (9) so the five kept classes use IDs 0-4
processed_metadata.loc[processed_metadata['class'] == 'siren', 'classID'] = 4
processed_metadata.loc[processed_metadata['class'] == 'street_music', 'classID'] = 0

# %% app.ipynb 5
class load_x(Transform):
    "Load a .wav file and convert it to a fixed-size, decibel-scaled mel spectrogram."

    def __init__(self):
        self.sr = 44100
        self.max_ms = 4000
        self.channels = 2
        # self.transform = transform

    def rechannel(self, waveform, sr):
        if waveform.shape[0] == self.channels:
            # no rechanneling needed
            return waveform, sr
        if self.channels == 1:
            # convert stereo to mono by selecting the first channel
            new_waveform = waveform[:1, :]
        elif self.channels == 2:
            # convert mono to stereo by duplicating the first channel
            new_waveform = torch.cat([waveform, waveform])
        return new_waveform, sr

    def resample(self, waveform, sr):
        if sr == self.sr:
            # no resampling needed
            return waveform, sr
        num_channels = waveform.shape[0]
        # resample first channel
        new_waveform = torchaudio.transforms.Resample(sr, self.sr)(waveform[:1, :])
        if num_channels > 1:
            # resample second channel and merge the two
            re_two = torchaudio.transforms.Resample(sr, self.sr)(waveform[1:, :])
            new_waveform = torch.cat([new_waveform, re_two])
        return (new_waveform, self.sr)

    def pad_trunc(self, waveform, sr):
        num_channels, num_frames = waveform.shape
        max_len = sr // 1000 * self.max_ms
        if num_frames > max_len:
            # truncate signal to the given length
            waveform = waveform[:, :max_len]
        else:
            # get padding lengths for beginning and end
            begin_ln = random.randint(0, max_len - num_frames)
            end_ln = max_len - num_frames - begin_ln
            # pad the audio with zeros
            pad_begin = torch.zeros((num_channels, begin_ln))
            pad_end = torch.zeros((num_channels, end_ln))
            waveform = torch.cat((pad_begin, waveform, pad_end), 1)
        return (waveform, sr)

    def mel_specgram(self, waveform, sr):
        mel_tfm = T.MelSpectrogram(
            sample_rate=sr,
            n_fft=1024,
            win_length=None,
            hop_length=512,
            center=True,
            pad_mode="reflect",
            power=2.0,
            norm="slaney",
            onesided=True,
            n_mels=128,
            mel_scale="htk")
        spec = mel_tfm(waveform)
        # convert the power spectrogram to decibels
        waveform = torchaudio.transforms.AmplitudeToDB(top_db=80)(spec)
        return waveform, sr

    def encodes(self, x):
        waveform, sr = torchaudio.load(x)
        waveform, sr = self.resample(waveform, sr)
        waveform, sr = self.pad_trunc(waveform, sr)
        waveform, sr = self.rechannel(waveform, sr)
        waveform, sr = self.mel_specgram(waveform, sr)
        return waveform

class load_label_tfm(Transform):
    "Look up the class label for a clip by its file name."

    def __init__(self, metadata=processed_metadata):
        self.metadata = metadata

    def encodes(self, x):
        return self.metadata.loc[self.metadata['slice_file_name'] == x.name]['class'].item()
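# A minimal sketch (not one of the exported notebook cells): `load_x` maps any
# .wav clip to a fixed-size, decibel-scaled mel spectrogram tensor, so a quick
# shape check looks like the lines below. This assumes `examples/` holds at
# least one clip; `spec` is an illustrative name, not used elsewhere in the app.
#
#   spec = load_x()(audios[0])
#   print(spec.shape)  # expect roughly torch.Size([2, 128, 345]) for 4 s at 44.1 kHz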
# %% app.ipynb 6
repo_id = "Jimmie/urban8k"
learner = from_pretrained_fastai(repo_id)

# %% app.ipynb 14
categories = tuple(learner.dls.vocab)

def classify_audio(audio):
    # use Path to open the audio file
    audio_path = Path(audio)
    pred, idx, probs = learner.predict(audio_path)
    return dict(zip(categories, map(float, probs)))

# %% app.ipynb 16
title = "Environmental Sound Classification"
description = """
This demo showcases how AI can be used to recognize environmental sounds. It focuses on 5 classes:
car_horn, children_playing, dog_bark, siren, and street_music.

When uploading audio, make sure it is in .wav format and less than 4 seconds long.

Enjoy!
"""

mic = gr.Audio(source='upload', type="filepath", label='Upload Audio File here')
label = gr.outputs.Label()
examples = list(data.ls())

intf = gr.Interface(fn=classify_audio, inputs=mic, outputs=label, examples=examples,
                    title=title, description=description, cache_examples=False,
                    auto_submit_duration=5)
intf.launch(inline=False)
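# A minimal sketch for a local smoke test of `classify_audio` before launching
# the interface, assuming `examples/` is populated. `preds` and `top` are
# illustrative names only, not part of the app.
#
#   preds = classify_audio(str(audios[0]))
#   top = max(preds, key=preds.get)
#   print(top, preds[top])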