Mridul
committed on
Commit
•
782d9c8
1
Parent(s):
a380e3b
Adding the initial files
Browse files- app.py +119 -0
- helper.py +53 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import librosa
|
4 |
+
import librosa.display
|
5 |
+
import numpy as np
|
6 |
+
from matplotlib.colors import ListedColormap
|
7 |
+
import torch
|
8 |
+
from pprint import pprint
|
9 |
+
import tempfile
|
10 |
+
import sounddevice as sd
|
11 |
+
import helper as hp
|
12 |
+
from io import BytesIO
|
13 |
+
|
14 |
+
magicEnabled = False
|
15 |
+
|
16 |
+
st.title("Human Voice Activity Detector")
|
17 |
+
|
18 |
+
# record audio
|
19 |
+
st.subheader("Record Audio From Microphone")
|
20 |
+
with st.form("enter_info_form"):
|
21 |
+
filename = st.text_input("FILENAME")+".wav"
|
22 |
+
duration = st.number_input("DURATION", min_value=0)
|
23 |
+
record_button = st.form_submit_button("Record")
|
24 |
+
|
25 |
+
|
26 |
+
st.session_state["recording_done"] = False
|
27 |
+
|
28 |
+
if record_button:
|
29 |
+
|
30 |
+
if "recording_state" not in st.session_state:
|
31 |
+
st.session_state["recording_state"] = True
|
32 |
+
|
33 |
+
try:
|
34 |
+
hp.record_Audio(filename, duration)
|
35 |
+
|
36 |
+
# reading the content of the audio file
|
37 |
+
with open(filename, 'rb') as file:
|
38 |
+
audio_content = file.read()
|
39 |
+
audio_file = BytesIO(audio_content) # converting it to BytesIO format
|
40 |
+
|
41 |
+
st.download_button(
|
42 |
+
label=f"Download {filename}",
|
43 |
+
data = audio_file,
|
44 |
+
file_name=filename,
|
45 |
+
mime="audio/wav",
|
46 |
+
)
|
47 |
+
|
48 |
+
except ValueError as e:
|
49 |
+
st.error(str(e))
|
50 |
+
# TODO
|
51 |
+
#upload audio file with streamlit
|
52 |
+
else:
|
53 |
+
audio_file = st.file_uploader("Upload Audio", type=["wav"])
|
54 |
+
|
55 |
+
if audio_file is not None:
|
56 |
+
# Save the uploaded audio file to a temporary file
|
57 |
+
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
|
58 |
+
tmp_file.write(audio_file.getvalue())
|
59 |
+
# tmp_file.write(audio_file.read())
|
60 |
+
tmp_file_name = tmp_file.name
|
61 |
+
|
62 |
+
# audio_file.seek(0) # Seek to the beginning of the file
|
63 |
+
tmp_file.close()
|
64 |
+
# print(audio_file)
|
65 |
+
plt.figure(figsize = (14,5))
|
66 |
+
data, sample_rate = librosa.load(tmp_file_name,sr=16000)
|
67 |
+
# Plot the waveform
|
68 |
+
plt.figure(figsize=(10, 4))
|
69 |
+
librosa.display.waveshow(data, sr=16000)
|
70 |
+
plt.title("Waveform")
|
71 |
+
plt.xlabel("Time (s)")
|
72 |
+
plt.ylabel("Amplitude")
|
73 |
+
plt.tight_layout()
|
74 |
+
|
75 |
+
# Display the plot in Streamlit
|
76 |
+
st.audio(data, format="audio/wav", sample_rate=sample_rate)
|
77 |
+
st.caption("Raw Audio Waveform")
|
78 |
+
st.pyplot(plt)
|
79 |
+
|
80 |
+
with st.spinner('Processing the audio file...'):
|
81 |
+
torch.set_num_threads(1)
|
82 |
+
|
83 |
+
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
|
84 |
+
model='silero_vad',
|
85 |
+
force_reload=True)
|
86 |
+
|
87 |
+
(get_speech_timestamps,
|
88 |
+
_, read_audio,
|
89 |
+
*_) = utils
|
90 |
+
|
91 |
+
sampling_rate = 16000
|
92 |
+
wav = read_audio(audio_file, sampling_rate=sampling_rate) #type(wav) = <class 'torch.Tensor'>
|
93 |
+
# print(wav)
|
94 |
+
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sampling_rate)
|
95 |
+
# pprint(speech_timestamps)
|
96 |
+
|
97 |
+
plt.figure(figsize = (14,5))
|
98 |
+
# data,sample_rate = librosa.load(local_audio_file_path, sr=sampling_rate)
|
99 |
+
librosa.display.waveshow(np.array(wav), sr = sampling_rate)
|
100 |
+
if len(speech_timestamps) != 0:
|
101 |
+
plt.title("Detected Speech Segments")
|
102 |
+
plt.xlabel("Time (s)")
|
103 |
+
plt.ylabel("Amplitude")
|
104 |
+
for timestamp in speech_timestamps:
|
105 |
+
start_time = timestamp['start'] / sampling_rate
|
106 |
+
end_time = timestamp['end'] / sampling_rate
|
107 |
+
plt.axvspan(start_time, end_time, alpha=0.5, color='gray', label='Detected Speech')
|
108 |
+
|
109 |
+
st.success("Speech Segments Detected!")
|
110 |
+
st.caption("Model Output with Detected Speech Segments")
|
111 |
+
st.pyplot(plt)
|
112 |
+
else:
|
113 |
+
print("No Speech Detected")
|
114 |
+
st.error("No Speech Detected")
|
115 |
+
|
116 |
+
if st.session_state['recording_done']:
|
117 |
+
if st.button("Reset", ):
|
118 |
+
st.session_state["recording_state"] = False
|
119 |
+
st.rerun()
|
helper.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from pyaudio import paInt16, PyAudio
|
3 |
+
import wave
|
4 |
+
|
5 |
+
def record_Audio(filename, duration):
    """
    Record audio through PyAudio and write it to a WAV file.

    The recording is mono, 16000 Hz, in the ``paInt16`` sample format, and is
    read from the input stream in 1024-frame chunks. Status is reported
    through Streamlit: an info box while recording (only when
    ``st.session_state["recording_state"]`` is truthy) and a success box once
    the file has been written. ``st.session_state["recording_done"]`` is set
    to ``True`` on successful completion.

    Args:
        filename: Path of the WAV file to create.
        duration: Requested recording length in seconds.

    Raises:
        ValueError: If ``filename`` is empty.
    """
    if not filename:
        raise ValueError("Filename not specified. Please provide a filename!")

    CHUNK = 1024
    FORMAT = paInt16
    CHANNELS = 1
    RATE = 16000
    RECORD_TIME = duration

    recording_state = st.session_state.get("recording_state", False)
    recording_info_placeholder = st.empty()
    if recording_state:
        recording_info_placeholder.info("Recording... ")

    p = PyAudio()
    stream = None
    try:
        with wave.open(filename, 'wb') as f:
            f.setnchannels(CHANNELS)
            f.setsampwidth(p.get_sample_size(FORMAT))
            f.setframerate(RATE)

            stream = p.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True)

            # Always bind stop_button so the loop below never hits an
            # unbound name when no recording state is active.
            # NOTE(review): the button value is read once, before the loop,
            # so it cannot change while chunks are being captured -- confirm
            # whether a mid-recording stop is actually expected to work.
            stop_button = st.button("Stop Recording") if recording_state else False

            # Multiply before dividing: RATE // CHUNK floors to 15 chunks per
            # second (15360 frames), which made every recording ~4% short.
            total_chunks = int(RATE * RECORD_TIME) // CHUNK
            for _ in range(total_chunks):
                f.writeframes(stream.read(CHUNK))

                if stop_button:
                    break
    finally:
        # Release the audio device even if reading or writing failed.
        if stream is not None:
            stream.close()
        p.terminate()

    recording_info_placeholder.success("Recording Completed\nThese are the results:")

    st.session_state["recording_done"] = True
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit==1.28.0
|
2 |
+
matplotlib==3.7.2
|
3 |
+
librosa==0.10.0.post2
|
4 |
+
numpy==1.24.3
|
5 |
+
torch==2.0.1
|
6 |
+
torchaudio==2.0.2
|
7 |
+
pyaudio==0.2.13
|
8 |
+
wave==0.0.2
|