add gradio and requirement.txt
- app.py +281 -0
- requirements.txt +0 -0
app.py
ADDED
@@ -0,0 +1,281 @@
import os
# import scipy.io.matlab
import numpy as np
import matplotlib.pyplot as plt
import librosa as lb
import soundfile as sf
import sys
import pickle
import tensorflow as tf
from tensorflow.keras import Input,Model
from tensorflow.keras.layers import Dense,Dropout,Bidirectional,LSTM
import torch
import wave
import gradio as gr
from pydub import AudioSegment

torch.set_num_threads(1)

FRAME_TIME=80*10**(-3) # to try 60ms, 80ms,100ms, 110, 120,
HOP_TIME=10*10**(-3)
S_FRAME_TIME=10*10**(-3) # changed from 20ms to 32 ms to adjust FFT length
S_HOP_TIME=4.1*10**(-3) # 4.5for 80ms,4 for 100ms, 4 for 110ms,4.1 for 120
THRESHOLD_TIME=FRAME_TIME
SAMPLING_RATE=16000
N_MFCC=20
BREATH_THRESHOLD=100*10**(-3)
BREATH_TO_BREATH_TIME=150*10**(-3)
VAD_THRESHOLD=0.1
join=0
remove=1
classifier_threshold=0.5
# Specify the path to your pickle file
pickle_file_path = 'Normalisation_parameters_2018_full_data.pickle'
ModelWeightFilepath='Breath_detection_3BILSTM_2018_full_data_80ms_10ms_10ms_best_weights.hdf5'
# global model, utils, original_task_model, get_speech_timestamps, read_audio, Feature_mean, Feature_std
# ***********************************************
# Initialisation
# ***********************************************
print("Reading normalisation parameters")
try:
    # Open the file in binary read mode
    with open(pickle_file_path, 'rb') as file:
        # Load the object from the file
        Feature_mean,Feature_std = pickle.load(file)
        print("Object loaded successfully!")
        print(Feature_mean.shape,Feature_std.shape)
except Exception as e:
    print(f"An error occurred: {e}")

print("Initialising the Breath Detection model")
lstm_1= 24
l2_1= 0.02
drop_1= 0.25
lstm_2= 8
l2_2= 0.04
drop_2= 0.3
lstm_3= 24
l2_3= 0.03
drop_3= 0.45
lr= 0.0001

input = Input(shape=Feature_mean.shape)
# print(input.shape)
lay1=Bidirectional(LSTM(lstm_1,activation='tanh',kernel_regularizer=tf.keras.regularizers.l2(l2_1),
                        return_sequences=True))(input)
lay1=Dropout(drop_1)(lay1)
# print(lay1.shape)

lay2=Bidirectional(LSTM(lstm_2,activation='tanh',kernel_regularizer=tf.keras.regularizers.l2(l2_2),
                        return_sequences=True))(lay1)
lay2=Dropout(drop_2)(lay2)
# print(lay2.shape)

lay3=Bidirectional(LSTM(lstm_3,activation='tanh',kernel_regularizer=tf.keras.regularizers.l2(l2_3),
                        return_sequences=False))(lay2)
lay3=Dropout(drop_3)(lay3)
# print(lay3.shape)

output=Dense(1,activation='sigmoid')(lay3)
# print(output.shape)
original_task_model=Model(inputs=input,outputs=output,name='BILSTM_model')
# original_task_model.summary()
original_task_model.load_weights(ModelWeightFilepath)

print("Initialising Voice Activity Detection Model")
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True)

(get_speech_timestamps,_, read_audio,*_) = utils

def speech_feature_melspect(speech_seg,Fs,frame_length,hop_length,s_frame_length,s_hop_length):
    Feat=[]
    Feature_min=[]
    Feature_max=[]
    index_start=0
    index_end=frame_length
    fft_length=int(2**np.ceil(np.log(int(s_frame_length))/np.log(2)))
    speech_seg = lb.effects.preemphasis(speech_seg)
    while index_end<len(speech_seg):
        s_frame=speech_seg[range(index_start,index_end)]
        cepst=lb.feature.melspectrogram(y=s_frame.reshape((-1,)),sr=Fs,n_fft=fft_length,win_length=s_frame_length,
                                        hop_length=s_hop_length,window='hann',n_mels=60,power=1)
        cepst=lb.power_to_db(cepst, ref=np.max)
        Feat.append(cepst)
        index_start += hop_length
        index_end += hop_length
    Feat=np.array(Feat)
    return Feat

def read_speech_derive_vad (speech_file_path,sampling_rate,original_task_model,Feature_mean,Feature_std):

    # sampling_rate = SAMPLING_RATE # also accepts 8000
    wav = read_audio(speech_file_path, sampling_rate=SAMPLING_RATE)
    # get speech timestamps from full audio file
    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sampling_rate)
    index_vad = []
    for item in speech_timestamps:
        index_vad.extend([item['start'],item['end']])
    if len(index_vad) == 0:
        # guard against recordings where the VAD finds no speech at all
        index_vad = [0, len(wav)]
    if index_vad[0] != 0:
        index_vad = [0] + index_vad
    if index_vad[-1] != len(wav):
        index_vad.append(len(wav))
    else:
        index_vad = index_vad[:-1]
    index_vad = np.array(index_vad)
    speech,Fs=lb.load(speech_file_path,sr=sampling_rate)
    speech_scaled=speech/max(abs(speech))
    return speech,speech_scaled,index_vad

def remove_small_breaths(index_b,threshold_breath,speech_b_detect):
    for i in range(int(np.size(index_b)/2)):
        b_length=index_b[0,2*i+1]-index_b[0,2*i]
        if b_length <= threshold_breath:
            # clear detections shorter than the breath-length threshold (slice indexing clips safely at the array end)
            speech_b_detect[int(index_b[0,2*i]):int(index_b[0,2*i+1])+1]=0

    index_b=np.argwhere(abs(np.diff(speech_b_detect))==1)
    if speech_b_detect[0]==1:
        index_b=np.insert(index_b,0,0)

    if speech_b_detect[-1]==1:
        index_b=np.append(index_b,len(speech_b_detect))
    index_b=np.reshape(index_b,(1,-1))

    return index_b,speech_b_detect

def join_close_breaths(index_b,threshold_breath_to_breath,speech_b_detect):
    for i in range(int(np.size(index_b)/2)-1):
        bb_length=index_b[0,2*i+2]-index_b[0,2*i+1]
        if bb_length <= threshold_breath_to_breath:
            # merge breaths whose gap is below the breath-to-breath threshold
            speech_b_detect[int(index_b[0,2*i+1]):int(index_b[0,2*i+2])+1]=1

    index_b=np.argwhere(abs(np.diff(speech_b_detect))==1)
    if speech_b_detect[0]==1:
        index_b=np.insert(index_b,0,0)

    if speech_b_detect[-1]==1:
        index_b=np.append(index_b,len(speech_b_detect))
    index_b=np.reshape(index_b,(1,-1))
    frame_length=int(np.floor(FRAME_TIME*SAMPLING_RATE))
    hop_length=int(np.floor(HOP_TIME*SAMPLING_RATE))
    offset = frame_length - hop_length
    for i in range(int(np.size(index_b)/2)):
        # index_b has shape (1, N) after the reshape above, so index row 0 explicitly
        index_b[0,2*i+1] = index_b[0,2*i+1] + offset
        speech_b_detect[int(index_b[0,2*i]):int(index_b[0,2*i+1])+1]=1

    return index_b,speech_b_detect

# ***********************************************
def detect_breath_from_speed_vad(speech,index_vad):
    index_vad=np.reshape(index_vad,(1,-1))

    frame_length=int(np.floor(FRAME_TIME*SAMPLING_RATE))
    hop_length=int(np.floor(HOP_TIME*SAMPLING_RATE))
    s_frame_length=int(np.floor(S_FRAME_TIME*SAMPLING_RATE))
    s_hop_length=int(np.floor(S_HOP_TIME*SAMPLING_RATE))

    speech_b_detect=np.zeros(np.size(speech))

    for vi in range(int(np.size(index_vad)/2)):
        index_start=index_vad[0,2*vi]
        index_end=index_vad[0,2*vi+1]
        speech_seg=speech[index_start:index_end]
        if (len(speech_seg)> frame_length+1):
            feature=speech_feature_melspect(speech_seg, SAMPLING_RATE,
                                            frame_length, hop_length,
                                            s_frame_length, s_hop_length)
            feature=(feature-Feature_mean)/Feature_std
            prediction=original_task_model.predict(feature)
            # frame-level decisions: flatten the (n_frames, 1) output before thresholding
            y_pred=(prediction.reshape(-1)>classifier_threshold).astype(int)
            if sum(y_pred)>2:
                detect_point=np.argwhere(y_pred==1)
                speech_b_detect[int(index_start+detect_point[0]*hop_length):int(index_start+(detect_point[-1]+1)*hop_length)]=1

    index_b=np.argwhere(abs(np.diff(speech_b_detect))==1)
    if speech_b_detect[0]==1:
        index_b=np.insert(index_b,0,0)

    if speech_b_detect[-1]==1:
        index_b=np.append(index_b,len(speech_b_detect))
    index_b=np.reshape(index_b,(1,-1))
    index_b1=index_b.copy()
    threshold_breath=BREATH_THRESHOLD*SAMPLING_RATE
    threshold_breath_to_breath=BREATH_TO_BREATH_TIME*SAMPLING_RATE

    if join==1:
        index_b,speech_b_detect=join_close_breaths(index_b,threshold_breath_to_breath,speech_b_detect)
    if remove==1:
        index_b,speech_b_detect=remove_small_breaths(index_b,threshold_breath,speech_b_detect)

    return speech_b_detect

def detect_breath_from_speed(speech_file_path,original_task_model,Feature_mean,Feature_std):
    print("Running voice activity detection")
    speech,speech_scaled,index_vad=read_speech_derive_vad(speech_file_path,SAMPLING_RATE,original_task_model,Feature_mean,Feature_std)
    print("Detecting breath sounds in speech")
    speech_b_detect=detect_breath_from_speed_vad(speech,index_vad)
    return speech,speech_b_detect

def plot_waveform(speech,SAMPLING_RATE,speech_b_detect):
    # Create the X values (time axis) based on the length of the speech data and the sampling rate
    X = np.divide(range(0, len(speech)), SAMPLING_RATE)

    # Create a figure
    plt.figure(figsize=(12, 8))

    # Define font size
    font_size = 24

    # Second subplot: speech and detected breath regions
    plt.subplot(3, 1, 2)
    plt.plot(X, speech, label="Speech", color='blue', linewidth=2)
    plt.plot(X, 0.15 * speech_b_detect, label="Detected breath", color='red', linewidth=3)
    plt.title("Speech and detected breaths", fontsize=24)
    plt.legend(fontsize=12)
    plt.xlabel("Time (seconds)", fontsize=20)
    plt.ylabel("Amplitude", fontsize=20)
    plt.grid(True)

    # Save to a file
    output_image_file = "waveform.png"
    plt.savefig(output_image_file)
    plt.close()

    return output_image_file

# if __name__ == "__main__":
#     speech_file_path = 'DATA\Introductory\C1W1L01.wav'
#     original_task_model,Feature_mean,Feature_std = initialisation()

def gradio_interface(image_file,input_audio_file):
    # Load the audio file
    audio = AudioSegment.from_file(input_audio_file)
    # Process the audio (e.g., normalize)
    processed_audio = audio.normalize()
    # Export the processed audio to a file
    speech_file_path = "input_audio.wav"
    processed_audio.export(speech_file_path, format="wav")
    speech,speech_b_detect = detect_breath_from_speed(speech_file_path,original_task_model,Feature_mean,Feature_std)
    breath_output = 50*np.multiply(speech,speech_b_detect)
    breath_enhanced_speech = speech + breath_output
    print("Writing output file")
    # make sure the output directory exists before writing
    os.makedirs("Output", exist_ok=True)
    output_audio_file = "Output/Breath_v1.wav"
    sf.write(output_audio_file, breath_enhanced_speech, samplerate=SAMPLING_RATE, format='WAV')
    output_image_file = plot_waveform(speech,SAMPLING_RATE,speech_b_detect)
    return output_image_file,output_audio_file

# Create the Gradio interface
default_image = "Text.png"
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.Image(type="filepath", value=default_image, interactive=False), gr.Audio(sources=["microphone","upload"], type="filepath")],
    outputs=[gr.Image(type="filepath"), gr.Audio(type="filepath")],
    title="Breath Sound Detector",
    description="Record yourself reading the given paragraph. The audio is processed and breath detection is performed; detected breaths are shown in the plot, and the breath-enhanced speech can be played back.",
)

# Launch the Gradio interface
iface.launch()
requirements.txt
ADDED
Binary file (5.64 kB).
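The requirements.txt itself appears here only as a binary blob, so its exact contents and any version pins are not visible in this diff. As a rough sketch, judging purely from the imports in app.py above, the file would need to pull in at least the packages listed below; torchaudio is an assumption, included only because the silero-vad read_audio utility depends on it, and ffmpeg is typically needed on the system for pydub to decode non-WAV uploads:

    gradio
    numpy
    matplotlib
    librosa
    soundfile
    tensorflow
    torch
    torchaudio  # assumed: used by silero-vad's read_audio utility
    pydub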