import streamlit as st
from huggingface_hub import hf_hub_url, cached_download
import torch
import torchaudio.transforms as transforms
from miniaudio import SampleFormat, decode
from librosa.util import fix_length
import numpy as np
from audio_recorder_streamlit import audio_recorder

from cnn import CNN

# Streamlit app title
st.markdown("## Noisy Human")
st.markdown("")
st.markdown(
    "Non-speech human sounds classification. This model can identify the "
    "following 10 classes with up to 78% accuracy:"
)

col1, col2 = st.columns(2)
with st.container():
    with col1:
        st.markdown(
            """
* Clapping 👏
* Footsteps 🦶
* Brushing Teeth 🪥
* Drinking Sipping 🧃
* Laughing 😂
"""
        )
    with col2:
        st.markdown(
            """
* Breathing 🌬️
* Crying Baby 😭
* Coughing 🤧
* Snoring 😴
* Sneezing 🤧
"""
        )

REPO_ID = "santiviquez/noisy_human_cnn"
FILENAME = "CNN_MelSpec_Deltas_fold_4_.pth"
RATE = 22050  # sample rate (Hz) the model was trained on


@st.cache(allow_output_mutation=True)
def download_model():
    # Download the model weights from the Hugging Face Hub and cache them.
    model_weights = torch.load(
        cached_download(hf_hub_url(REPO_ID, FILENAME)),
        map_location=torch.device("cpu"),
    )
    return model_weights


model_weights = download_model()

model = CNN(input_channels=2)
model.load_state_dict(model_weights)
model.eval()

# Let the user either upload a .wav file or record audio in the browser.
uploaded_file = st.file_uploader(
    "Choose an audio (.wav) file", accept_multiple_files=False
)
st.caption("OR")
recorded_bytes = audio_recorder()

# Prefer the uploaded file; fall back to the in-browser recording.
audio_bytes = uploaded_file.read() if uploaded_file is not None else recorded_bytes

if audio_bytes:
    st.audio(audio_bytes, format="audio/ogg")

    # Decode to mono, signed 32-bit PCM at the model's sample rate.
    decoded_audio = decode(
        audio_bytes, nchannels=1, sample_rate=RATE, output_format=SampleFormat.SIGNED32
    )
    waveform = np.array(decoded_audio.samples)
    # Pad or trim to exactly 5 seconds so the spectrogram has a fixed width.
    waveform = fix_length(waveform, size=5 * RATE)
    waveform = torch.FloatTensor(waveform)

    # Model input: mel spectrogram plus its deltas, stacked as 2 channels.
    # 216 time frames = 5 s of audio at the default hop length (n_fft // 2 = 512).
    x_mel = transforms.MelSpectrogram(sample_rate=RATE, n_fft=1024, n_mels=60)(waveform)
    x_deltas = transforms.ComputeDeltas()(x_mel)
    x = torch.cat((x_mel, x_deltas)).view(1, 2, 60, 216)

    y_pred = model(x)
    y_pred_softmax = torch.log_softmax(y_pred, dim=1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim=1)

    category_map = {
        0: "Clapping 👏",
        1: "Footsteps 🦶",
        2: "Brushing Teeth 🪥",
        3: "Drinking Sipping 🧃",
        4: "Laughing 😂",
        5: "Breathing 🌬️",
        6: "Crying Baby 😭",
        7: "Coughing 🤧",
        8: "Snoring 😴",
        9: "Sneezing 🤧",
    }
    st.write("**Predicted class:**", category_map[y_pred_tags.item()])

st.text("")
st.text("")
st.text("")
st.markdown(
    """`Created by` [Santiago Viquez](https://twitter.com/santiviquez) and
[Ivan Padezhki](https://github.com/ivanpadezhki) |
`Code:` [GitHub](https://github.com/santiviquez/noisy-human-recognition)"""
)
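
# Usage note (a minimal sketch; the file name app.py is an assumption, not
# confirmed by the repo): with the dependencies above installed, launch the
# app from the command line with Streamlit's standard runner:
#
#   streamlit run app.py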