Jimmie committed
Commit 0c693cc
1 parent: 4b60c3a

initial commit

.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/*.wav filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,168 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: app.ipynb.
+
+# %% auto 0
+__all__ = ['data', 'audios', 'metadata', 'to_consider', 'processed_metadata', 'repo_id', 'learner', 'categories', 'title',
+           'description', 'mic', 'label', 'examples', 'intf', 'process_audio_exists', 'load_x', 'load_label_tfm',
+           'classify_audio']
+
+# %% app.ipynb 1
+import torch
+import gradio as gr
+from gradio import CSVLogger
+from fastai.vision.all import *
+import torchaudio
+import torchaudio.transforms as T
+import warnings
+from huggingface_hub import from_pretrained_fastai
+
+# %% app.ipynb 2
+warnings.filterwarnings("ignore")
+
+# %% app.ipynb 3
+def process_audio_exists(audio):
+    slice_name = audio.name
+
+    # check if slice name exists in new metadata file
+    row = processed_metadata.loc[processed_metadata['slice_file_name'] == slice_name].index.any()
+
+    return row
+
+# %% app.ipynb 4
+data = Path('examples')
+audios = get_files(data, extensions='.wav')
+
+metadata = pd.read_csv('UrbanSound8K.csv')
+to_consider = ['siren', 'street_music', 'children_playing', 'dog_bark', 'car_horn']
+processed_metadata = metadata.loc[metadata['class'].isin(to_consider)]
+processed_metadata.loc[processed_metadata['class'] == 'siren', 'classID'] = 4
+processed_metadata.loc[processed_metadata['class'] == 'street_music', 'classID'] = 0
+
+# %% app.ipynb 5
+class load_x(Transform):
+    def __init__(self):
+        self.sr = 44100
+        self.max_ms = 4000
+        self.channels = 2
+        # self.transform = transform
+    def rechannel(self, waveform, sr):
+        if (waveform.shape[0] == self.channels):
+            # no rechanneling needed
+            return waveform, sr
+
+        if (self.channels==1):
+            # converting stereo to mono
+            # by selecting the first channel
+            new_waveform = waveform[:1,:]
+
+        elif (self.channels==2):
+            # converting mono to stereo
+            # by duplicating the first channel
+            new_waveform = torch.cat([waveform, waveform])
+        return new_waveform, sr
+
+    def resample(self, waveform, sr):
+        if (sr==self.sr):
+            # no resampling needed
+            return waveform, sr
+
+        num_channels = waveform.shape[0]
+
+        # resample first channel
+        new_waveform = torchaudio.transforms.Resample(sr, self.sr)(waveform[:1,:])
+        if (num_channels) > 1:
+            # resample second channel and merge the two
+            re_two = torchaudio.transforms.Resample(sr, self.sr)(waveform[1:,:])
+            new_waveform = torch.cat([new_waveform, re_two])
+
+        return (new_waveform, self.sr)
+
+    def pad_trunc(self, waveform, sr):
+        num_channels, num_frames = waveform.shape
+        max_len = sr//1000 * self.max_ms
+
+        if (num_frames>max_len):
+            # truncate signal to given length
+            waveform = waveform[:,:max_len]
+
+        else:
+            # get padding lengths for beginning and end
+            begin_ln = random.randint(0, max_len-num_frames)
+            end_ln = max_len - num_frames - begin_ln
+
+            # pad the audio with zeros
+            pad_begin = torch.zeros((num_channels, begin_ln))
+            pad_end = torch.zeros((num_channels, end_ln))
+
+            waveform = torch.cat((pad_begin, waveform, pad_end), 1)
+
+        return (waveform, sr)
+
+    def mel_specgram(self, waveform, sr):
+        mel_tfm = T.MelSpectrogram(
+            sample_rate=sr,
+            n_fft=1024,
+            win_length=None,
+            hop_length=512,
+            center=True,
+            pad_mode="reflect",
+            power=2.0,
+            norm="slaney",
+            onesided=True,
+            n_mels=128,
+            mel_scale="htk")
+
+        spec = mel_tfm(waveform)
+
+        waveform = torchaudio.transforms.AmplitudeToDB(top_db=80)(spec)
+
+        return waveform, sr
+
+
+    def encodes(self, x):
+        waveform, sr = torchaudio.load(x)
+        waveform, sr = self.resample(waveform, sr)
+        waveform, sr = self.pad_trunc(waveform, sr)
+        waveform, sr = self.rechannel(waveform, sr)
+        waveform, sr = self.mel_specgram(waveform, sr)
+        return waveform
+
+
+class load_label_tfm(Transform):
+    def __init__(self, metadata=processed_metadata): self.metadata = metadata
+    def encodes(self, x):
+        return self.metadata.loc[self.metadata['slice_file_name'] == x.name]['class'].item()
+
+# %% app.ipynb 6
+repo_id = "Jimmie/urban8k"
+
+learner = from_pretrained_fastai(repo_id)
+
+# %% app.ipynb 14
+categories = tuple(learner.dls.vocab)
+
+def classify_audio(audio):
+    # use Path to open audio
+    audio_path = Path(audio)
+    pred,idx,probs = learner.predict(audio_path)
+    return dict(zip(categories, map(float, probs)))
+
+# %% app.ipynb 16
+title = "Environmental Sound Classification"
+
+description = """
+This demo showcases how AI can be used to recognize environmental sounds. It focuses specifically on 5 classes: car_horn, children_playing, dog_bark, siren and street music
+
+
+When uploading audio, make sure it is in .wav format and is less than 4 seconds long.
+
+Enjoy!
+"""
+mic = gr.Audio(source='upload', type="filepath", label='Upload Audio File here')
+label = gr.outputs.Label()
+examples = list(data.ls())
+
+intf = gr.Interface(fn=classify_audio, inputs=mic, outputs=label, examples=examples,
+                    title=title, description=description, cache_examples=False,
+                    auto_submit_duration=5)
+
+intf.launch(inline=False)
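
For readers who want to sanity-check the preprocessing outside the Space, the snippet below is a hypothetical local check, not part of this commit. It assumes the load_x class defined in app.py above is already in scope and that the files under examples/ have been pulled from LFS.

# Hypothetical local sanity check (assumes load_x from app.py is defined
# and `git lfs pull` has fetched the example clips).
from pathlib import Path

tfm = load_x()
spec = tfm(Path("examples/dog_bark.wav"))
# resample -> 44.1 kHz, pad_trunc -> 4 s, rechannel -> 2 channels, mel_specgram -> 128 mel bins
print(spec.shape)  # expected roughly torch.Size([2, 128, ~344])

The same idea applies end to end: once the learner has been downloaded, classify_audio(Path("examples/siren.wav")) should return a dict mapping the five class names to probabilities.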
examples/car_horn.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:390a545a53dfe52f47a92876691eb40e64d1240c8885be7f72df3654b8fe70f8
+size 705644
examples/children_playing.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a01b7b6f6e9d51a57a7abf1128518c68631f3c7095736f0364479c813e07ab8
+size 768044
examples/dog_bark.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fefae5223783da73b535df8815dea61a285f444f0770228c9d9ec8ea5a2e65c7
+size 705644
examples/siren.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7b54c8d4a92dbd21fdbe5ba3027a289fe2c4f636d14bacf7205b07543e26f78
+size 768044
examples/street_music.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d976eeb884ede8c4c731bf616e197a40a7a9ecef47b9005e2a1f6acaec8888c3
+size 1152080
requirements.txt ADDED
@@ -0,0 +1,2 @@
+fastai<=2.7.11
+torchaudio<=0.13.1
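
Note that app.py also imports gradio and huggingface_hub, which requirements.txt does not pin; on a Hugging Face Space the Gradio SDK supplies gradio itself. For a plain local run, a fuller pin set might look like the hypothetical sketch below (the extra lines and version bounds are assumptions, not part of this commit):

# hypothetical requirements for running app.py outside a Space
fastai<=2.7.11
torchaudio<=0.13.1
gradio<4            # app.py uses the Gradio 3.x source=/gr.outputs API
huggingface_hub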