File size: 5,560 Bytes
51c17e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import numpy as np
import librosa
import tensorflow as tf
import streamlit as st
import plotly.express as px
import pandas as pd
import soundfile as sf
from pydub import AudioSegment
import matplotlib.pyplot as plt
import time

# Framing used by the breath detector, expressed in seconds.
window_length = 0.02  # analysis window: 20 ms
hop_length = 0.0025  # stride between windows: 2.5 ms
# Rate (Hz) passed to librosa.load when the uploaded audio is read.
sample_rate = 22050
# Duration of the most recent prediction in seconds; predict_audio() overwrites it.
inference_time = 1

# Load the TFLite classifier once and keep its tensor metadata for later calls.
interpreter = tf.lite.Interpreter(model_path=r"model_breath_logspec_mfcc_cnn.tflite")
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

def convert_mp3_to_wav(mp3_path):
    """Transcode an MP3 file to WAV and return the path of the new file.

    The WAV file is written next to the source, with the final extension
    swapped for ``.wav``.

    Args:
        mp3_path: Path of the MP3 file to decode.

    Returns:
        Path of the exported WAV file.
    """
    import os  # local import so the block does not rely on a file-level one

    # Swap only the trailing extension. A plain str.replace(".mp3", ".wav")
    # also rewrites ".mp3" inside directory names and leaves names such as
    # "take.MP3" untouched, in which case the export overwrote its own source.
    stem, _ = os.path.splitext(mp3_path)
    wav_path = stem + ".wav"
    audio = AudioSegment.from_mp3(mp3_path)
    audio.export(wav_path, format="wav")
    return wav_path

def extract_breath_features(y, sr):
    """Build a per-frame 0/1 mask flagging frames that look like breaths.

    A frame is flagged when both its zero-crossing rate and its RMS energy
    exceed 0.1. Framing comes from the module-level ``window_length`` and
    ``hop_length`` (seconds), converted to samples at ``sr``.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        1-D integer array with one entry per frame.
    """
    framing = {
        "frame_length": int(window_length * sr),
        "hop_length": int(hop_length * sr),
    }
    crossing_rate = librosa.feature.zero_crossing_rate(y=y, **framing).flatten()
    energy = librosa.feature.rms(y=y, **framing).flatten()
    is_breath = (crossing_rate > 0.1) & (energy > 0.1)
    return np.where(is_breath, 1, 0)

def extract_features(y, sr, n_mels=128, n_mfcc=13):
    """Stack MFCC, log-mel and breath-mask rows into one feature matrix.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y`` in Hz.
        n_mels: Number of mel bands in the log-mel spectrogram.
        n_mfcc: Number of MFCC coefficients.

    Returns:
        Array with ``n_mfcc + n_mels + 1`` rows (the breath mask is the last
        row) and one column per frame, or ``None`` if extraction failed, in
        which case the error is shown in the Streamlit UI.
    """
    try:
        # Extract MFCC, log-mel spectrogram and breath mask.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mel_power = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
        logspec = librosa.power_to_db(mel_power)
        breath_feature = extract_breath_features(y, sr)

        # The three features can differ in frame count; crop every one of
        # them to the shortest so they can be stacked.
        # NOTE(review): the breath mask is framed with its own hop size while
        # MFCC/log-mel use librosa's default, so this crop presumably keeps
        # only the leading part of the mask. Left unchanged because the model
        # input depends on it; confirm against the training pipeline.
        min_len = min(mfcc.shape[1], logspec.shape[1], len(breath_feature))
        mfcc = librosa.util.fix_length(mfcc, size=min_len, axis=1)
        logspec = librosa.util.fix_length(logspec, size=min_len, axis=1)
        breath_feature = librosa.util.fix_length(breath_feature, size=min_len)

        return np.vstack((mfcc, logspec, breath_feature))
    except Exception as e:
        # UI boundary: report the failure instead of crashing the app, but
        # include the cause so the message is actionable.
        st.error(f"Error processing: {e}")
        return None
def prepare_single_data(features, max_len=500):
    """Shape one feature matrix into a single-item float32 batch.

    Axis 1 is cropped or padded to ``max_len`` with
    ``librosa.util.fix_length``; a leading and a trailing singleton axis are
    then added.

    Args:
        features: Feature array to prepare.
        max_len: Target length of axis 1.

    Returns:
        float32 array with two more dimensions than ``features``.
    """
    fixed = librosa.util.fix_length(features, size=max_len, axis=1)
    batch = np.expand_dims(fixed, axis=(0, -1))
    return batch.astype(np.float32)

def predict_audio(features):
    """Classify one feature matrix with the TFLite interpreter.

    As a side effect, stores the elapsed wall-clock time of the call (input
    preparation included), in seconds, in the module-level
    ``inference_time``.

    Args:
        features: Feature matrix to classify; it is passed through
            ``prepare_single_data`` before being fed to the model.

    Returns:
        Tuple ``(class_index, scores)``: the arg-max index of the model
        output for this sample and the corresponding output row.
    """
    global inference_time
    start_time = time.time()
    prepared_features = prepare_single_data(features)
    interpreter.set_tensor(input_details[0]['index'], prepared_features)
    interpreter.invoke()
    prediction = interpreter.get_tensor(output_details[0]['index'])
    # Elapsed = end - start. The operands were reversed before, which stored
    # a negative duration.
    inference_time = time.time() - start_time
    return np.argmax(prediction, axis=1)[0], prediction[0]

def plot_waveform(y, sr, start_time, end_time):
    """Render the selected slice of the signal as an interactive line chart.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y`` in Hz.
        start_time: Start of the slice in seconds.
        end_time: End of the slice in seconds.

    Returns:
        Tuple ``(y_trimmed, sr)``: the sliced samples and the unchanged rate.
    """
    first = int(start_time * sr)
    last = int(end_time * sr)
    y_trimmed = y[first:last]
    # One evenly spaced timestamp per retained sample.
    seconds = np.linspace(start_time, end_time, num=len(y_trimmed))
    table = pd.DataFrame({"Time (s)": seconds, "Amplitude": y_trimmed})
    chart = px.line(table, x="Time (s)", y="Amplitude", title="Waveform")
    st.plotly_chart(chart, use_container_width=True)
    return y_trimmed, sr

def visualize_features(features, duration):
    """Chart the last feature row, labelled "Breath Feature", against time.

    Args:
        features: 2-D feature matrix; only its last row is plotted.
        duration: Length in seconds over which the columns are spread evenly.
    """
    seconds = np.linspace(0, duration, features.shape[1])
    table = pd.DataFrame({
        "Time (s)": seconds,
        "Breath Feature": features[-1],
    })
    chart = px.line(
        table,
        x="Time (s)",
        y="Breath Feature",
        title="Breath Feature Over Time",
    )
    st.plotly_chart(chart, use_container_width=True)

def plot_mfcc_and_logspec(y, sr):
    """Draw the MFCC and log-mel spectrogram of ``y`` in the sidebar.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y`` in Hz.
    """
    # librosa.display is a submodule that is not guaranteed to be loaded by a
    # bare ``import librosa`` on older releases; import it explicitly so
    # specshow always resolves.
    import librosa.display

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    # melspectrogram yields a power spectrogram by default, so the dB
    # conversion must be power_to_db (amplitude_to_db would double the dB
    # values). This also matches the conversion used in extract_features.
    logspec = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128))
    with st.sidebar:
        st.write("### MFCC and Log-Spectrogram")

        for title, data in (("MFCC", mfcc), ("Log-Mel Spectrogram", logspec)):
            fig, ax = plt.subplots(figsize=(5, 3))
            # Draw on the explicit axes rather than pyplot's implicit
            # "current" figure, so colorbar and title cannot land elsewhere.
            img = librosa.display.specshow(data, sr=sr, x_axis='time', ax=ax)
            fig.colorbar(img, ax=ax)
            ax.set_title(title)
            st.pyplot(fig)
            # Streamlit reruns the script on every interaction; close the
            # figure so open figures do not pile up in pyplot's registry.
            plt.close(fig)

# ---- Page layout and upload widget ----
st.title('Audio Integrity Net')
st.subheader('Documentation to be added')
uploaded_file = st.file_uploader('Upload an audio file', type=['wav', 'mp3'])

if uploaded_file:
    import os
    import tempfile

    # Spool the upload to a private temp file that keeps its real extension.
    # A fixed 'temp_audio.wav' in the working directory is shared by every
    # session of the app (concurrent uploads overwrite each other), labels
    # MP3 data as WAV, and is never cleaned up.
    suffix = os.path.splitext(uploaded_file.name)[1] or '.wav'
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
        audio_path = tmp.name
    try:
        y, sr = librosa.load(audio_path, sr=sample_rate)
    finally:
        # The samples are in memory now; the file is no longer needed.
        os.remove(audio_path)

    # Let the user pick the time window to analyse and show it.
    duration = librosa.get_duration(y=y, sr=sr)
    start_time, end_time = st.slider("Select time range", 0.0, duration, (0.0, duration))
    y_trimmed, sr = plot_waveform(y, sr, start_time, end_time)

    if st.sidebar.button("Show MFCC & Log-Spectrogram"):
        plot_mfcc_and_logspec(y_trimmed, sr)

    # Feature extraction returns None (after reporting the error) on failure.
    features = extract_features(y_trimmed, sr)
    if features is not None:
        st.success("Feature Extraction Completed!")
        visualize_features(features, end_time - start_time)
        prediction, probability = predict_audio(features)
        # Class index 0 is reported as "Real", anything else as "Fake".
        if prediction == 0:
            st.subheader(' Predicted class is Real ')
        else:
            st.subheader('Predicted class is Fake')
        st.write(f'Probability of being real: {probability[0] * 100:.2f}%')
        st.write(f'Probability of being fake: {probability[1] * 100:.2f}%')
        # abs() keeps the displayed duration non-negative whatever the sign
        # of the stored time difference.
        inference_time = abs(inference_time)
        st.write(f"Inference Time: {inference_time:.6f} seconds")