import numpy as np
import librosa
import librosa.display
import tensorflow as tf
import streamlit as st
import plotly.express as px
import pandas as pd
import soundfile as sf
from pydub import AudioSegment
import matplotlib.pyplot as plt
import time
# Parameters for audio processing
window_length = 0.02  # 20 ms analysis window
hop_length = 0.0025   # 2.5 ms hop between frames
sample_rate = 22050

# Most recent model inference time in seconds (updated by predict_audio)
inference_time = 0.0
# Load TFLite model
interpreter = tf.lite.Interpreter(model_path="model_breath_logspec_mfcc_cnn.tflite")
interpreter.allocate_tensors()
input_details, output_details = interpreter.get_input_details(), interpreter.get_output_details()
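# The stacked features below are 13 MFCC + 128 log-mel + 1 breath row = 142 rows,
# padded/trimmed to 500 frames, so the model is assumed to expect an input tensor
# of shape (1, 142, 500, 1); check input_details[0]['shape'] to confirm.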
def convert_mp3_to_wav(mp3_path):
    audio = AudioSegment.from_mp3(mp3_path)
    wav_path = mp3_path.replace(".mp3", ".wav")
    audio.export(wav_path, format="wav")
    return wav_path
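# Note: pydub needs ffmpeg (or libav) available on PATH to decode MP3 files.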
def extract_breath_features(y, sr):
    frame_length = int(window_length * sr)
    hop_length_samples = int(hop_length * sr)
    zcr = librosa.feature.zero_crossing_rate(y=y, frame_length=frame_length, hop_length=hop_length_samples)
    rmse = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length_samples)
    breaths = (zcr.flatten() > 0.1) & (rmse.flatten() > 0.1)
    return np.where(breaths, 1, 0)
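# The 0.1 thresholds are heuristic: extract_breath_features marks a frame as
# breath (1) only when both its zero-crossing rate and RMS energy exceed 0.1,
# yielding a binary per-frame indicator aligned with the other frame-level features.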
def extract_features(y, sr, n_mels=128, n_mfcc=13):
    try:
        # Extract MFCC & log-mel spectrogram
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        logspec = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels))
        breath_feature = extract_breath_features(y, sr)
        # Trim or pad every feature to the length of the shortest one
        min_len = min(mfcc.shape[1], logspec.shape[1], len(breath_feature))
        mfcc = librosa.util.fix_length(mfcc, size=min_len, axis=1)
        logspec = librosa.util.fix_length(logspec, size=min_len, axis=1)
        breath_feature = librosa.util.fix_length(breath_feature, size=min_len)
        return np.vstack((mfcc, logspec, breath_feature))
    except Exception as e:
        st.error(f"Error processing audio: {e}")
        return None
def prepare_single_data(features, max_len=500):
    features = librosa.util.fix_length(features, size=max_len, axis=1)
    return features[np.newaxis, ..., np.newaxis].astype(np.float32)
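# prepare_single_data reshapes (n_features, max_len) to (1, n_features, max_len, 1):
# a batch axis in front and a channel axis at the end, as the CNN input expects.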
def predict_audio(features):
    global inference_time
    start_time = time.time()
    prepared_features = prepare_single_data(features)
    interpreter.set_tensor(input_details[0]['index'], prepared_features)
    interpreter.invoke()
    prediction = interpreter.get_tensor(output_details[0]['index'])
    end_time = time.time()
    inference_time = end_time - start_time
    return np.argmax(prediction, axis=1)[0], prediction[0]
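# prediction[0] is assumed to hold [P(real), P(fake)], matching the class
# labels used below (index 0 = Real, index 1 = Fake).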
def plot_waveform(y, sr, start_time, end_time):
    start_sample, end_sample = int(start_time * sr), int(end_time * sr)
    y_trimmed = y[start_sample:end_sample]
    times = np.linspace(start_time, end_time, num=len(y_trimmed))
    df = pd.DataFrame({"Time (s)": times, "Amplitude": y_trimmed})
    st.plotly_chart(px.line(df, x="Time (s)", y="Amplitude", title="Waveform"), use_container_width=True)
    return y_trimmed, sr
def visualize_features(features, duration):
    time_axis = np.linspace(0, duration, features.shape[1])
    # The last row of the stacked feature matrix is the binary breath indicator
    df_breath = pd.DataFrame({"Time (s)": time_axis, "Breath Feature": features[-1]})
    st.plotly_chart(px.line(df_breath, x="Time (s)", y="Breath Feature", title="Breath Feature Over Time"), use_container_width=True)
def plot_mfcc_and_logspec(y, sr):
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    # melspectrogram returns power, so power_to_db is the correct conversion
    logspec = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128))
    with st.sidebar:
        st.write("### MFCC and Log-Spectrogram")
        # Plot MFCC
        fig, ax = plt.subplots(figsize=(5, 3))
        img = librosa.display.specshow(mfcc, sr=sr, x_axis='time', ax=ax)
        fig.colorbar(img, ax=ax)
        ax.set_title("MFCC")
        st.pyplot(fig)
        # Plot log-mel spectrogram
        fig, ax = plt.subplots(figsize=(5, 3))
        img = librosa.display.specshow(logspec, sr=sr, x_axis='time', ax=ax)
        fig.colorbar(img, ax=ax)
        ax.set_title("Log-Mel Spectrogram")
        st.pyplot(fig)
st.title('Audio Integrity Net')
st.subheader('Documentation to be added')
uploaded_file = st.file_uploader('Upload an audio file', type=['wav', 'mp3'])
if uploaded_file:
    with open('temp_audio.wav', 'wb') as f:
        f.write(uploaded_file.getbuffer())
    y, sr = librosa.load('temp_audio.wav', sr=sample_rate)
    duration = librosa.get_duration(y=y, sr=sr)
    start_time, end_time = st.slider("Select time range", 0.0, duration, (0.0, duration))
    y_trimmed, sr = plot_waveform(y, sr, start_time, end_time)
    if st.sidebar.button("Show MFCC & Log-Spectrogram"):
        plot_mfcc_and_logspec(y_trimmed, sr)
    features = extract_features(y_trimmed, sr)
    if features is not None:
        st.success("Feature Extraction Completed!")
        visualize_features(features, end_time - start_time)
        prediction, probability = predict_audio(features)
        if prediction == 0:
            st.subheader('Predicted class is Real')
        else:
            st.subheader('Predicted class is Fake')
        st.write(f'Probability of being real: {probability[0] * 100:.2f}%')
        st.write(f'Probability of being fake: {probability[1] * 100:.2f}%')
        st.write(f"Inference Time: {inference_time:.6f} seconds")