kmiyasar committed on
Commit aa7d0f8 · 1 Parent(s): 9857f2e

add gradio and requirements.txt

Files changed (2)
  1. app.py +281 -0
  2. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,281 @@
+ import os
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import librosa as lb
+ import soundfile as sf
+ import pickle
+ import tensorflow as tf
+ from tensorflow.keras import Input, Model
+ from tensorflow.keras.layers import Dense, Dropout, Bidirectional, LSTM
+ import torch
+ import gradio as gr
+ from pydub import AudioSegment
+
+ # Keep PyTorch single-threaded; the Silero VAD model is small.
+ torch.set_num_threads(1)
+
+ # Analysis windowing (seconds). Other frame lengths tried: 60 ms, 100 ms, 110 ms, 120 ms.
+ FRAME_TIME = 80e-3
+ HOP_TIME = 10e-3
+ S_FRAME_TIME = 10e-3     # sub-frame length for the mel spectrogram
+ S_HOP_TIME = 4.1e-3      # 4.5 for 80 ms, 4 for 100 ms, 4 for 110 ms, 4.1 for 120 ms
+ THRESHOLD_TIME = FRAME_TIME
+ SAMPLING_RATE = 16000
+ N_MFCC = 20
+ BREATH_THRESHOLD = 100e-3          # minimum duration for a detected breath
+ BREATH_TO_BREATH_TIME = 150e-3     # maximum gap between breaths to be merged
+ VAD_THRESHOLD = 0.1
+ join = 0
+ remove = 1
+ classifier_threshold = 0.5
+
+ # Paths to the normalisation parameters and trained model weights
+ pickle_file_path = 'Normalisation_parameters_2018_full_data.pickle'
+ ModelWeightFilepath = 'Breath_detection_3BILSTM_2018_full_data_80ms_10ms_10ms_best_weights.hdf5'
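+ # Derived sample counts at 16 kHz (worked out for reference, not in the original
+ # source): frame_length = 1280 samples, hop_length = 160, s_frame_length = 160,
+ # s_hop_length = 65; the FFT length used later rounds 160 up to the next power
+ # of two, 256.
+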
+ # ***********************************************
+ # Initialisation
+ # ***********************************************
+ print("Reading normalisation parameters")
+ try:
+     # Load the per-feature mean and standard deviation used to normalise inputs
+     with open(pickle_file_path, 'rb') as file:
+         Feature_mean, Feature_std = pickle.load(file)
+     print("Object loaded successfully!")
+     print(Feature_mean.shape, Feature_std.shape)
+ except Exception as e:
+     print(f"An error occurred: {e}")
+
+ print("Initialising the Breath Detection model")
+ # BiLSTM hyperparameters (units, L2 regularisation, dropout per layer)
+ lstm_1 = 24
+ l2_1 = 0.02
+ drop_1 = 0.25
+ lstm_2 = 8
+ l2_2 = 0.04
+ drop_2 = 0.3
+ lstm_3 = 24
+ l2_3 = 0.03
+ drop_3 = 0.45
+ lr = 0.0001
+
+ # Three stacked bidirectional LSTM layers; only the last collapses the time
+ # dimension (return_sequences=False) before the sigmoid output.
+ net_input = Input(shape=Feature_mean.shape)
+ lay1 = Bidirectional(LSTM(lstm_1, activation='tanh', kernel_regularizer=tf.keras.regularizers.l2(l2_1),
+                           return_sequences=True))(net_input)
+ lay1 = Dropout(drop_1)(lay1)
+
+ lay2 = Bidirectional(LSTM(lstm_2, activation='tanh', kernel_regularizer=tf.keras.regularizers.l2(l2_2),
+                           return_sequences=True))(lay1)
+ lay2 = Dropout(drop_2)(lay2)
+
+ lay3 = Bidirectional(LSTM(lstm_3, activation='tanh', kernel_regularizer=tf.keras.regularizers.l2(l2_3),
+                           return_sequences=False))(lay2)
+ lay3 = Dropout(drop_3)(lay3)
+
+ output = Dense(1, activation='sigmoid')(lay3)
+ original_task_model = Model(inputs=net_input, outputs=output, name='BILSTM_model')
+ original_task_model.load_weights(ModelWeightFilepath)
+
+ print("Initialising Voice Activity Detection Model")
+ # Silero VAD from torch.hub; utils is a tuple of helper functions
+ model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
+                               model='silero_vad',
+                               force_reload=True)
+ (get_speech_timestamps, _, read_audio, *_) = utils
+
+ def speech_feature_melspect(speech_seg, Fs, frame_length, hop_length, s_frame_length, s_hop_length):
+     # Slide a long frame over the segment; within each frame, compute a
+     # 60-band mel spectrogram (in dB) that serves as the feature sequence.
+     Feat = []
+     index_start = 0
+     index_end = frame_length
+     # FFT length: next power of two at or above the sub-frame length
+     fft_length = int(2**np.ceil(np.log2(int(s_frame_length))))
+     speech_seg = lb.effects.preemphasis(speech_seg)
+     while index_end < len(speech_seg):
+         s_frame = speech_seg[index_start:index_end]
+         cepst = lb.feature.melspectrogram(y=s_frame.reshape((-1,)), sr=Fs, n_fft=fft_length,
+                                           win_length=s_frame_length, hop_length=s_hop_length,
+                                           window='hann', n_mels=60, power=1)
+         cepst = lb.power_to_db(cepst, ref=np.max)
+         Feat.append(cepst)
+         index_start += hop_length
+         index_end += hop_length
+     return np.array(Feat)
+
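+ # Shape check (derived, for reference): with frame_length=1280, s_hop_length=65
+ # and librosa's centred framing, each window should yield 1 + 1280//65 = 20
+ # sub-frames, i.e. a (60, 20) mel patch, which must match Feature_mean.shape.
+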
+ def read_speech_derive_vad(speech_file_path, sampling_rate, original_task_model, Feature_mean, Feature_std):
+     wav = read_audio(speech_file_path, sampling_rate=sampling_rate)
+     # Get speech timestamps from the full audio file
+     speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sampling_rate)
+     index_vad = []
+     for item in speech_timestamps:
+         index_vad.extend([item['start'], item['end']])
+     # Convert the speech boundaries into boundaries of the non-speech gaps,
+     # since the breath detector scans the pauses between speech.
+     if not index_vad:
+         index_vad = [0]          # no speech found: treat the whole file as one gap
+     elif index_vad[0] != 0:
+         index_vad = [0] + index_vad
+     else:
+         index_vad = index_vad[1:]
+     if index_vad[-1] != len(wav):
+         index_vad.append(len(wav))
+     else:
+         index_vad = index_vad[:-1]
+     index_vad = np.array(index_vad)
+     speech, Fs = lb.load(speech_file_path, sr=sampling_rate)
+     speech_scaled = speech / max(abs(speech))
+     return speech, speech_scaled, index_vad
+
+ def remove_small_breaths(index_b, threshold_breath, speech_b_detect):
+     # Zero out detected breaths shorter than threshold_breath samples
+     for i in range(int(np.size(index_b)/2)):
+         b_length = index_b[0, 2*i+1] - index_b[0, 2*i]
+         if b_length <= threshold_breath:
+             speech_b_detect[int(index_b[0, 2*i]):int(index_b[0, 2*i+1])+1] = 0
+
+     # Recompute breath boundaries from the cleaned binary mask
+     index_b = np.argwhere(abs(np.diff(speech_b_detect)) == 1)
+     if speech_b_detect[0] == 1:
+         index_b = np.insert(index_b, 0, 0)
+     if speech_b_detect[-1] == 1:
+         index_b = np.append(index_b, len(speech_b_detect))
+     index_b = np.reshape(index_b, (1, -1))
+     return index_b, speech_b_detect
+
+ def join_close_breaths(index_b, threshold_breath_to_breath, speech_b_detect):
+     # Merge consecutive breaths separated by less than threshold_breath_to_breath samples
+     for i in range(int(np.size(index_b)/2)-1):
+         bb_length = index_b[0, 2*i+2] - index_b[0, 2*i+1]
+         if bb_length <= threshold_breath_to_breath:
+             speech_b_detect[int(index_b[0, 2*i+1]):int(index_b[0, 2*i+2])+1] = 1
+
+     # Recompute breath boundaries from the merged binary mask
+     index_b = np.argwhere(abs(np.diff(speech_b_detect)) == 1)
+     if speech_b_detect[0] == 1:
+         index_b = np.insert(index_b, 0, 0)
+     if speech_b_detect[-1] == 1:
+         index_b = np.append(index_b, len(speech_b_detect))
+     index_b = np.reshape(index_b, (1, -1))
+
+     # Extend each breath end by one frame minus one hop, so the mask covers the
+     # full extent of the last positive frame (clipped to the signal length).
+     frame_length = int(np.floor(FRAME_TIME*SAMPLING_RATE))
+     hop_length = int(np.floor(HOP_TIME*SAMPLING_RATE))
+     offset = frame_length - hop_length
+     for i in range(int(np.size(index_b)/2)):
+         index_b[0, 2*i+1] = min(index_b[0, 2*i+1] + offset, len(speech_b_detect))
+         speech_b_detect[int(index_b[0, 2*i]):int(index_b[0, 2*i+1])+1] = 1
+     return index_b, speech_b_detect
+
+ # ***********************************************
+ def detect_breath_from_speed_vad(speech, index_vad):
+     index_vad = np.reshape(index_vad, (1, -1))
+
+     frame_length = int(np.floor(FRAME_TIME*SAMPLING_RATE))
+     hop_length = int(np.floor(HOP_TIME*SAMPLING_RATE))
+     s_frame_length = int(np.floor(S_FRAME_TIME*SAMPLING_RATE))
+     s_hop_length = int(np.floor(S_HOP_TIME*SAMPLING_RATE))
+
+     speech_b_detect = np.zeros(np.size(speech))
+
+     # Run the BiLSTM classifier on each non-speech segment
+     for vi in range(int(np.size(index_vad)/2)):
+         index_start = index_vad[0, 2*vi]
+         index_end = index_vad[0, 2*vi+1]
+         speech_seg = speech[index_start:index_end]
+         if len(speech_seg) > frame_length+1:
+             feature = speech_feature_melspect(speech_seg, SAMPLING_RATE,
+                                               frame_length, hop_length,
+                                               s_frame_length, s_hop_length)
+             feature = (feature - Feature_mean) / Feature_std
+             prediction = original_task_model.predict(feature)
+             y_pred = (prediction > classifier_threshold).astype(int).ravel()
+             # Require at least three positive frames before marking a breath
+             if sum(y_pred) > 2:
+                 detect_point = np.argwhere(y_pred == 1)
+                 speech_b_detect[int(index_start+detect_point[0]*hop_length):int(index_start+(detect_point[-1]+1)*hop_length)] = 1
+
+     # Breath boundaries from the binary mask
+     index_b = np.argwhere(abs(np.diff(speech_b_detect)) == 1)
+     if speech_b_detect[0] == 1:
+         index_b = np.insert(index_b, 0, 0)
+     if speech_b_detect[-1] == 1:
+         index_b = np.append(index_b, len(speech_b_detect))
+     index_b = np.reshape(index_b, (1, -1))
+
+     threshold_breath = BREATH_THRESHOLD*SAMPLING_RATE
+     threshold_breath_to_breath = BREATH_TO_BREATH_TIME*SAMPLING_RATE
+     if join == 1:
+         index_b, speech_b_detect = join_close_breaths(index_b, threshold_breath_to_breath, speech_b_detect)
+     if remove == 1:
+         index_b, speech_b_detect = remove_small_breaths(index_b, threshold_breath, speech_b_detect)
+
+     return speech_b_detect
+
+ def detect_breath_from_speed(speech_file_path, original_task_model, Feature_mean, Feature_std):
+     print("Running Voice Activity Detection")
+     speech, speech_scaled, index_vad = read_speech_derive_vad(speech_file_path, SAMPLING_RATE, original_task_model, Feature_mean, Feature_std)
+     print("Detecting breath sounds in speech")
+     speech_b_detect = detect_breath_from_speed_vad(speech, index_vad)
+     return speech, speech_b_detect
+
+ def plot_waveform(speech, SAMPLING_RATE, speech_b_detect):
+     # Time axis in seconds
+     X = np.arange(len(speech)) / SAMPLING_RATE
+
+     plt.figure(figsize=(12, 8))
+
+     # Speech waveform with the detected breath mask overlaid
+     plt.plot(X, speech, label="Speech", color='blue', linewidth=2)
+     plt.plot(X, 0.15 * speech_b_detect, label="Detected breath", color='red', linewidth=3)
+     plt.title("Speech and detected breaths", fontsize=24)
+     plt.legend(fontsize=12)
+     plt.xlabel("Time (seconds)", fontsize=20)
+     plt.ylabel("Amplitude", fontsize=20)
+     plt.grid(True)
+
+     # Save to a file
+     output_image_file = "waveform.png"
+     plt.savefig(output_image_file)
+     plt.close()
+     return output_image_file
+
+ # Example offline usage (initialisation above runs at import time):
+ # if __name__ == "__main__":
+ #     speech_file_path = r'DATA\Introductory\C1W1L01.wav'
+ #     speech, speech_b_detect = detect_breath_from_speed(speech_file_path, original_task_model, Feature_mean, Feature_std)
+
+ def gradio_interface(image_file, input_audio_file):
+     # image_file is the fixed instruction image; only the audio is processed.
+     audio = AudioSegment.from_file(input_audio_file)
+     # Normalise the recording level
+     processed_audio = audio.normalize()
+     # Export the processed audio to a WAV file
+     speech_file_path = "input_audio.wav"
+     processed_audio.export(speech_file_path, format="wav")
+     speech, speech_b_detect = detect_breath_from_speed(speech_file_path, original_task_model, Feature_mean, Feature_std)
+     # Amplify the detected breath regions and mix them back into the speech
+     breath_output = 50 * np.multiply(speech, speech_b_detect)
+     breath_enhanced_speech = speech + breath_output
+     print("Writing output file")
+     os.makedirs("Output", exist_ok=True)   # ensure the output directory exists
+     output_audio_file = "Output/Breath_v1.wav"
+     sf.write(output_audio_file, breath_enhanced_speech, samplerate=SAMPLING_RATE, format='WAV')
+     output_image_file = plot_waveform(speech, SAMPLING_RATE, speech_b_detect)
+     return output_image_file, output_audio_file
+
+ # Create the Gradio interface
+ default_image = "Text.png"
+ iface = gr.Interface(
+     fn=gradio_interface,
+     inputs=[gr.Image(type="filepath", value=default_image, interactive=False),
+             gr.Audio(sources=["microphone", "upload"], type="filepath")],
+     outputs=[gr.Image(type="filepath"), gr.Audio(type="filepath")],
+     title="Breath Sound Detector",
+     description="Record your speech reading the given paragraph. The audio is processed, breath detection is performed, the detected breaths are marked in the output image, and the breath-enhanced speech can be played back.",
+ )
+
+ # Launch the Gradio interface
+ iface.launch()
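
Once the app is running, the interface can also be called programmatically. A minimal sketch using gradio_client (the Space id and file names here are placeholders, and this assumes a recent gradio_client that provides handle_file):

    from gradio_client import Client, handle_file

    # Placeholder Space id; substitute the actual repo path
    client = Client("username/breath-detector")
    image_path, audio_path = client.predict(
        handle_file("Text.png"),            # fixed instruction-image input
        handle_file("my_recording.wav"),    # speech recording to analyse
        api_name="/predict",                # default endpoint name for gr.Interface
    )
    print(image_path, audio_path)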
requirements.txt ADDED
Binary file (5.64 kB).
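
The diff does not show this file's contents. Judging from app.py's imports, a plausible requirements.txt would list at least the following packages (versions omitted; this is an assumption, not the actual file):

    tensorflow
    torch
    librosa
    soundfile
    gradio
    pydub
    matplotlib
    numpy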