|
import os
import tempfile
from io import BytesIO
from pprint import pprint

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import torch
from matplotlib.colors import ListedColormap

import helper as hp
|
|
|
# Whether the "Reset" button at the bottom of the page should be offered;
# switched on further down when the upload path is taken.
reset = False

st.title("Human Voice Activity Detector")
st.subheader("Record Audio From Microphone")

# Recording parameters are collected in a form so the script only acts on
# them when "Record" is pressed.
info_form = st.form("enter_info_form")
# The ".wav" extension is always appended to whatever the user typed.
filename = info_form.text_input("FILENAME") + ".wav"
duration = info_form.number_input("DURATION", min_value=0)
record_button = info_form.form_submit_button("Record")

# Cleared on every script run.
# NOTE(review): nothing in this file sets this flag to True — presumably the
# helper module does; confirm.
st.session_state["recording_done"] = False
|
|
|
# Obtain the audio to analyse: record from the microphone when the form was
# submitted, otherwise offer a file uploader.
#
# audio_file is bound up front so the analysis step that follows sees None
# (and is skipped) instead of raising NameError when recording fails.
audio_file = None

if record_button:
    if "recording_state" not in st.session_state:
        st.session_state["recording_state"] = True

    try:
        # presumably writes the recording to `filename` — the file is read
        # back from that path right below; verify against helper.
        hp.record_Audio(filename, duration)

        with open(filename, 'rb') as file:
            audio_content = file.read()
        audio_file = BytesIO(audio_content)
    except ValueError as e:
        # Surface the error in the page instead of a traceback.
        st.error(str(e))
    else:
        # Only offer the download once the recording was read successfully.
        st.download_button(
            label=f"Download {filename}",
            data=audio_file,
            file_name=filename,
            mime="audio/wav",
        )
else:
    audio_file = st.file_uploader("Upload Audio", type=["wav"])
    reset = True
|
|
|
# Sample rate (Hz) used for both the librosa preview and the VAD model input.
sampling_rate = 16000


@st.cache_resource
def _load_vad_model():
    """Fetch the Silero VAD model and its utilities from torch.hub.

    Cached with ``st.cache_resource`` so the hub lookup runs once per server
    process instead of on every Streamlit rerun (the previous
    ``force_reload=True`` forced a fresh download each time).

    Returns:
        The ``(model, utils)`` pair returned by ``torch.hub.load``.
    """
    return torch.hub.load(repo_or_dir='snakers4/silero-vad',
                          model='silero_vad')


if audio_file is not None:
    # librosa is given a real path, so the bytes are spilled to a temporary
    # file. delete=False lets the file be reopened by name after the handle
    # is closed; it is removed explicitly once decoding is done.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        tmp_file.write(audio_file.getvalue())
        tmp_file_name = tmp_file.name

    try:
        data, sample_rate = librosa.load(tmp_file_name, sr=sampling_rate)
    finally:
        os.unlink(tmp_file_name)

    # Raw waveform preview. An explicit Figure is used (and closed after
    # rendering) so figures do not pile up across reruns.
    raw_fig, raw_ax = plt.subplots(figsize=(10, 4))
    librosa.display.waveshow(data, sr=sample_rate, ax=raw_ax)
    raw_ax.set_title("Waveform")
    raw_ax.set_xlabel("Time (s)")
    raw_ax.set_ylabel("Amplitude")
    raw_fig.tight_layout()

    st.audio(data, format="audio/wav", sample_rate=sample_rate)
    st.caption("Raw Audio Waveform")
    st.pyplot(raw_fig)
    plt.close(raw_fig)

    with st.spinner('Processing the audio file...'):
        torch.set_num_threads(1)

        model, utils = _load_vad_model()
        (get_speech_timestamps,
         _, read_audio,
         *_) = utils

        # Rewind in case an earlier consumer advanced the stream position.
        audio_file.seek(0)
        wav = read_audio(audio_file, sampling_rate=sampling_rate)

        speech_timestamps = get_speech_timestamps(
            wav, model, sampling_rate=sampling_rate)

    if speech_timestamps:
        vad_fig, vad_ax = plt.subplots(figsize=(14, 5))
        librosa.display.waveshow(np.asarray(wav), sr=sampling_rate, ax=vad_ax)
        vad_ax.set_title("Detected Speech Segments")
        vad_ax.set_xlabel("Time (s)")
        vad_ax.set_ylabel("Amplitude")

        for timestamp in speech_timestamps:
            # 'start'/'end' are divided by the sample rate to place the
            # span on the time axis in seconds.
            start_time = timestamp['start'] / sampling_rate
            end_time = timestamp['end'] / sampling_rate
            vad_ax.axvspan(start_time, end_time, alpha=0.5, color='gray',
                           label='Detected Speech')

        st.success("Speech Segments Detected!")
        st.caption("Model Output with Detected Speech Segments")
        st.pyplot(vad_fig)
        plt.close(vad_fig)
    else:
        print("No Speech Detected")
        st.error("No Speech Detected")
|
|
|
# Offer a "Reset" button once a recording has finished or the upload path was
# taken. The button is only rendered when the first condition holds, because
# `and` short-circuits.
if (st.session_state['recording_done'] or reset) and st.button("Reset"):
    st.session_state["recording_state"] = False
    st.rerun()