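"""Noisy Human: a Streamlit demo that classifies non-speech human sounds.

The app records or uploads a short audio clip, converts it to a mel
spectrogram plus deltas, and classifies it with a CNN hosted on the
Hugging Face Hub.
"""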
import numpy as np
import streamlit as st
import torch
import torchaudio.transforms as transforms
from audio_recorder_streamlit import audio_recorder
from huggingface_hub import hf_hub_url, cached_download
from librosa.util import fix_length
from miniaudio import SampleFormat, decode

from cnn import CNN


# Streamlit app title
st.markdown("## Noisy Human")
st.markdown("")
st.markdown(
    "Non-speach human sounds classification. This model can identify with up to 78/% accuracy the following 10 classes"
)

col1, col2 = st.columns(2)
with st.container():
    with col1:
        st.markdown(
            """
* Clapping 👏
* Footsteps 🦶
* Brushing Teeth 🪥
* Drinking Sipping 🧃
* Laughing 😂
"""
        )

    with col2:
        st.markdown(
            """
* Breathing 🌬️
* Crying Baby 😭
* Coughing 🤧
* Snoring 😴
* Sneezing 🤧
"""
        )


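# Pretrained checkpoint on the Hugging Face Hub; RATE is the sample rate
# (Hz) used for decoding, padding, and feature extraction.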
REPO_ID = "santiviquez/noisy_human_cnn"
FILENAME = "CNN_MelSpec_Deltas_fold_4_.pth"
RATE = 22050


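# Cache the weight download across Streamlit reruns and load onto the CPU,
# since the demo is not assumed to have a GPU.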
@st.cache(allow_output_mutation=True)
def download_model():
    model_weights = torch.load(
        cached_download(hf_hub_url(REPO_ID, FILENAME)), map_location=torch.device("cpu")
    )
    return model_weights


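# Build the two-channel CNN (mel spectrogram + deltas), load the pretrained
# weights, and switch to eval mode for inference.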
model_weights = download_model()
model = CNN(input_channels=2)
model.load_state_dict(model_weights)
model.eval()
uploaded_file = st.file_uploader(
    "Choose an audio (.wav) file", accept_multiple_files=False
)
st.caption("OR")
recorded_bytes = audio_recorder()

# Prefer the microphone recording; otherwise fall back to the uploaded
# file. Both paths produce raw audio bytes for decoding below.
audio_bytes = recorded_bytes or (uploaded_file.read() if uploaded_file else None)

if audio_bytes:
    st.audio(audio_bytes, format="audio/wav")

    # Decode to mono signed 32-bit PCM at the model's sample rate.
    decoded_audio = decode(
        audio_bytes, nchannels=1, sample_rate=RATE, output_format=SampleFormat.SIGNED32
    )

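    # Pad or trim the decoded waveform to exactly 5 seconds so every clip
    # yields a fixed-size spectrogram.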
    waveform = np.array(decoded_audio.samples)
    waveform = fix_length(waveform, size=5 * RATE)
    waveform = torch.FloatTensor(waveform)

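    # Stack the 60-bin mel spectrogram and its deltas as two input channels;
    # 216 frames = 110250 samples / 512 hop (the n_fft // 2 default) + 1.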
    x_mel = transforms.MelSpectrogram(sample_rate=RATE, n_fft=1024, n_mels=60)(waveform)
    x_deltas = transforms.ComputeDeltas()(x_mel)
    x = torch.cat((x_mel, x_deltas)).view(1, 2, 60, 216)

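    # Forward pass; log_softmax is monotonic, so the argmax over classes
    # matches the argmax of the raw logits.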
    y_pred = model(x)
    y_pred_softmax = torch.log_softmax(y_pred, dim=1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim=1)

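    # Index-to-label mapping; the order must match the label encoding used
    # during training.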
    category_map = {
        0: "Clapping 👏",
        1: "Footsteps 🦶",
        2: "Brushing Teeth 🪥",
        3: "Drinking Sipping 🧃",
        4: "Laughing 😂",
        5: "Breathing 🌬️",
        6: "Crying Baby 😭",
        7: "Coughing 🤧",
        8: "Snoring 😴",
        9: "Sneezing 🤧",
    }

    st.write("**Predicted class:**", category_map[y_pred_tags.item()])

st.text("")
st.text("")
st.text("")
st.markdown(
    """`Create by` [Santiago Viquez](https://twitter.com/santiviquez) 
    and [Ivan Padezhki](https://github.com/ivanpadezhki) 
    |  `Code:` [GitHub](https://github.com/santiviquez/noisy-human-recognition)"""
)