import time

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
import tensorflow as tf
from pydub import AudioSegment

# Analysis frame settings (in seconds) and the target sample rate for librosa.load.
window_length = 0.02
hop_length = 0.0025
sample_rate = 22050

# Updated by predict_audio(); initialized so the UI can always report a value.
inference_time = 1.0

# Load the TFLite model once at startup and cache its tensor metadata.
interpreter = tf.lite.Interpreter(model_path="model_breath_logspec_mfcc_cnn.tflite")
interpreter.allocate_tensors()
input_details, output_details = interpreter.get_input_details(), interpreter.get_output_details()
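# Input shape note (derived from the feature code below, not read from the model
# file itself): prepare_single_data() produces a (1, 142, 500, 1) float32 tensor,
# i.e. batch x (13 MFCC + 128 log-mel + 1 breath rows) x 500 frames x 1 channel.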


def convert_mp3_to_wav(mp3_path):
    # Decode the MP3 with pydub (requires ffmpeg) and re-export it as WAV.
    audio = AudioSegment.from_mp3(mp3_path)
    wav_path = mp3_path.replace(".mp3", ".wav")
    audio.export(wav_path, format="wav")
    return wav_path
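# Used in the upload handler at the bottom of this script: MP3 uploads are
# converted to WAV here before being passed to librosa.load.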


def extract_breath_features(y, sr):
    # Frame-level heuristic: flag frames whose zero-crossing rate and RMS
    # energy both exceed 0.1 as breath-like, giving a binary feature track.
    frame_length = int(window_length * sr)
    hop_length_samples = int(hop_length * sr)
    zcr = librosa.feature.zero_crossing_rate(y=y, frame_length=frame_length, hop_length=hop_length_samples)
    rmse = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length_samples)
    breaths = (zcr.flatten() > 0.1) & (rmse.flatten() > 0.1)
    return np.where(breaths, 1, 0)
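# Note: this breath track uses a ~2.5 ms hop (hop_length * sr ~= 55 samples at
# 22.05 kHz), while the MFCC and mel spectrogram below use librosa's default
# hop of 512 samples, so the streams have different frame counts;
# extract_features() truncates everything to the shortest stream before stacking.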


def extract_features(y, sr, n_mels=128, n_mfcc=13):
    try:
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        logspec = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels))
        breath_feature = extract_breath_features(y, sr)

        # Align the three feature streams to the shortest frame count.
        min_len = min(mfcc.shape[1], logspec.shape[1], len(breath_feature))
        mfcc = librosa.util.fix_length(mfcc, size=min_len, axis=1)
        logspec = librosa.util.fix_length(logspec, size=min_len, axis=1)
        breath_feature = librosa.util.fix_length(breath_feature, size=min_len)

        return np.vstack((mfcc, logspec, breath_feature))
    except Exception as e:
        st.error(f"Error processing audio: {e}")
        return None
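# The stacked matrix has 13 + 128 + 1 = 142 rows (MFCC, log-mel, breath),
# with one column per retained frame.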


def prepare_single_data(features, max_len=500):
    # Pad or truncate to a fixed 500 frames, then add batch and channel axes.
    features = librosa.util.fix_length(features, size=max_len, axis=1)
    return features[np.newaxis, ..., np.newaxis].astype(np.float32)
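# The float32 cast assumes the TFLite model takes float inputs; a fully
# quantized (int8) model would need input scaling/zero-point handling instead.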


def predict_audio(features):
    global inference_time
    start_time = time.time()
    prepared_features = prepare_single_data(features)
    interpreter.set_tensor(input_details[0]['index'], prepared_features)
    interpreter.invoke()
    prediction = interpreter.get_tensor(output_details[0]['index'])
    end_time = time.time()
    inference_time = end_time - start_time
    return np.argmax(prediction, axis=1)[0], prediction[0]
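# prediction[0] is a two-class score vector; index 0 is treated as "real" and
# index 1 as "fake" by the UI below.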


def plot_waveform(y, sr, start_time, end_time):
    # Trim to the selected window and plot amplitude against absolute time.
    start_sample, end_sample = int(start_time * sr), int(end_time * sr)
    y_trimmed = y[start_sample:end_sample]
    times = np.linspace(start_time, end_time, num=len(y_trimmed))
    df = pd.DataFrame({"Time (s)": times, "Amplitude": y_trimmed})
    st.plotly_chart(px.line(df, x="Time (s)", y="Amplitude", title="Waveform"), use_container_width=True)
    return y_trimmed, sr
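# Plotly draws one point per sample here (22,050 per second), which can get
# slow for long clips; decimating y_trimmed before plotting is a possible
# optimization.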


def visualize_features(features, duration):
    # The breath track is the last row of the stacked feature matrix.
    time_axis = np.linspace(0, duration, features.shape[1])
    df_breath = pd.DataFrame({"Time (s)": time_axis, "Breath Feature": features[-1]})
    st.plotly_chart(px.line(df_breath, x="Time (s)", y="Breath Feature", title="Breath Feature Over Time"), use_container_width=True)


def plot_mfcc_and_logspec(y, sr):
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    # Use power_to_db, matching extract_features(): melspectrogram returns power.
    logspec = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128))

    with st.sidebar:
        st.write("### MFCC and Log-Spectrogram")

        fig, ax = plt.subplots(figsize=(5, 3))
        img = librosa.display.specshow(mfcc, sr=sr, x_axis='time', ax=ax)
        fig.colorbar(img, ax=ax)
        ax.set_title("MFCC")
        st.pyplot(fig)

        fig, ax = plt.subplots(figsize=(5, 3))
        img = librosa.display.specshow(logspec, sr=sr, x_axis='time', ax=ax)
        fig.colorbar(img, ax=ax)
        ax.set_title("Log-Mel Spectrogram")
        st.pyplot(fig)


st.title('Audio Integrity Net')
st.subheader('Documentation to be added')
uploaded_file = st.file_uploader('Upload an audio file', type=['wav', 'mp3'])

if uploaded_file:
    # Save the upload under its real extension; convert MP3s with the helper
    # above so librosa always loads a genuine WAV file.
    suffix = '.mp3' if uploaded_file.name.lower().endswith('.mp3') else '.wav'
    temp_path = f'temp_audio{suffix}'
    with open(temp_path, 'wb') as f:
        f.write(uploaded_file.getbuffer())
    if suffix == '.mp3':
        temp_path = convert_mp3_to_wav(temp_path)

    y, sr = librosa.load(temp_path, sr=sample_rate)
    duration = librosa.get_duration(y=y, sr=sr)
    start_time, end_time = st.slider("Select time range", 0.0, duration, (0.0, duration))
    y_trimmed, sr = plot_waveform(y, sr, start_time, end_time)

    if st.sidebar.button("Show MFCC & Log-Spectrogram"):
        plot_mfcc_and_logspec(y_trimmed, sr)

    features = extract_features(y_trimmed, sr)
    if features is not None:
        st.success("Feature Extraction Completed!")
        visualize_features(features, end_time - start_time)
        prediction, probability = predict_audio(features)
        if prediction == 0:
            st.subheader('Predicted class is Real')
        else:
            st.subheader('Predicted class is Fake')
        st.write(f'Probability of being real: {probability[0] * 100:.2f}%')
        st.write(f'Probability of being fake: {probability[1] * 100:.2f}%')
        st.write(f"Inference Time: {inference_time:.6f} seconds")