# NOTE: pasted GitHub blob metadata (file size, commit hashes, line-number gutter)
# was removed from the top of this file; it was not part of the program.
import streamlit as st
import matplotlib.pyplot as plt
import librosa
import librosa.display
import numpy as np
from matplotlib.colors import ListedColormap
import torch
from pprint import pprint
import tempfile
import helper as hp
from io import BytesIO
reset = False

st.title("Human Voice Activity Detector")

# --- Record audio from the microphone -------------------------------------
st.subheader("Record Audio From Microphone")
with st.form("enter_info_form"):
    filename = st.text_input("FILENAME") + ".wav"
    duration = st.number_input("DURATION", min_value=0)
    record_button = st.form_submit_button("Record")

# Initialize the flag once instead of clobbering it on every rerun —
# otherwise "recording_done" could never stay True and the Reset button
# below was unreachable via the record path.
if "recording_done" not in st.session_state:
    st.session_state["recording_done"] = False

audio_file = None  # defined up-front so the analysis section below is safe
                   # even when recording fails with an exception

if record_button:
    st.session_state["recording_state"] = True
    try:
        # hp.record_Audio captures `duration` seconds from the microphone
        # and writes them to `filename` (assumed WAV — TODO confirm in helper).
        hp.record_Audio(filename, duration)
        st.session_state["recording_done"] = True
        # Read the content of the recorded audio file back in.
        with open(filename, "rb") as file:
            audio_content = file.read()
        audio_file = BytesIO(audio_content)  # in-memory copy for download/analysis
        st.download_button(
            label=f"Download {filename}",
            data=audio_file,
            file_name=filename,
            mime="audio/wav",
        )
    except ValueError as e:
        # hp.record_Audio signals invalid filename/duration via ValueError.
        st.error(str(e))
else:
    # Alternative path: upload an existing WAV file instead of recording.
    audio_file = st.file_uploader("Upload Audio", type=["wav"])
    reset = True

# --- Visualize and run voice-activity detection ---------------------------
if audio_file is not None:
    # Persist the bytes to a temp file: both librosa.load and Silero's
    # read_audio work reliably from a filesystem path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_file.write(audio_file.getvalue())
        tmp_file_name = tmp_file.name

    # Raw waveform plot (resampled to 16 kHz, which the VAD model expects).
    data, sample_rate = librosa.load(tmp_file_name, sr=16000)
    waveform_fig = plt.figure(figsize=(10, 4))
    librosa.display.waveshow(data, sr=sample_rate)
    plt.title("Waveform")
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")
    plt.tight_layout()
    st.audio(data, format="audio/wav", sample_rate=sample_rate)
    st.caption("Raw Audio Waveform")
    st.pyplot(waveform_fig)

    with st.spinner('Processing the audio file...'):
        torch.set_num_threads(1)
        # force_reload=False: reuse the cached hub checkout instead of
        # re-downloading the repo on every Streamlit rerun.
        model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                      model='silero_vad',
                                      force_reload=False)
        (get_speech_timestamps,
         _, read_audio,
         *_) = utils
        sampling_rate = 16000
        # Read from the temp-file path rather than the (possibly already
        # consumed) in-memory stream; returns a torch.Tensor.
        wav = read_audio(tmp_file_name, sampling_rate=sampling_rate)
        speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sampling_rate)

    vad_fig = plt.figure(figsize=(14, 5))
    librosa.display.waveshow(np.array(wav), sr=sampling_rate)
    if speech_timestamps:
        plt.title("Detected Speech Segments")
        plt.xlabel("Time (s)")
        plt.ylabel("Amplitude")
        # Timestamps come back in samples; convert to seconds for the plot.
        for timestamp in speech_timestamps:
            start_time = timestamp['start'] / sampling_rate
            end_time = timestamp['end'] / sampling_rate
            plt.axvspan(start_time, end_time, alpha=0.5, color='gray', label='Detected Speech')
        st.success("Speech Segments Detected!")
        st.caption("Model Output with Detected Speech Segments")
        st.pyplot(vad_fig)
    else:
        print("No Speech Detected")
        st.error("No Speech Detected")

# --- Reset control ---------------------------------------------------------
if st.session_state["recording_done"] or reset:
    if st.button("Reset"):
        st.session_state["recording_state"] = False
        st.session_state["recording_done"] = False
        st.rerun()