Spaces:

2000prasanth
/

audio_deepfake_detection_audiointegritynet

Running

audio_deepfake_detection_audiointegritynet

File size: 5,560 Bytes

51c17e6

import numpy as np
import librosa
import tensorflow as tf
import streamlit as st
import plotly.express as px
import pandas as pd
import soundfile as sf
from pydub import AudioSegment
import matplotlib.pyplot as plt
import time

# Parameters for audio processing
window_length = 0.02  # analysis window in seconds (20 ms)
hop_length = 0.0025  # hop between analysis windows in seconds (2.5 ms)
sample_rate = 22050  # rate (Hz) passed to librosa.load for uploaded audio

# Duration of the most recent inference, in seconds. predict_audio() rebinds
# this name through its own `global` declaration; a `global` statement at
# module level is a no-op, so none is used here.
inference_time = 0.0

# Load the TFLite model once at import time and cache its tensor metadata.
interpreter = tf.lite.Interpreter(model_path=r"model_breath_logspec_mfcc_cnn.tflite")
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

def convert_mp3_to_wav(mp3_path):
    """Convert an MP3 file to WAV and return the path of the WAV file.

    The WAV file is written next to the source, with the source's final
    extension swapped for ``.wav``.

    Args:
        mp3_path: Path (str) of the MP3 file to convert.

    Returns:
        The path of the exported WAV file.
    """
    import os  # local import keeps this function self-contained

    audio = AudioSegment.from_mp3(mp3_path)
    # Only the final extension is replaced. A plain str.replace(".mp3", ".wav")
    # would also rewrite ".mp3" inside directory names, and would leave names
    # such as "clip.MP3" unchanged so the export overwrote the source file.
    base, _ = os.path.splitext(mp3_path)
    wav_path = base + ".wav"
    audio.export(wav_path, format="wav")
    return wav_path

def extract_breath_features(y, sr):
    """Return a per-frame 0/1 mask marking frames treated as breaths.

    A frame is marked 1 when both its zero-crossing rate and its RMS energy
    exceed 0.1. Framing uses the module-level ``window_length`` and
    ``hop_length`` (seconds), converted to samples with ``sr``.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        1-D integer array with one entry per analysis frame.
    """
    framing = {
        "frame_length": int(window_length * sr),
        "hop_length": int(hop_length * sr),
    }
    zero_cross = librosa.feature.zero_crossing_rate(y=y, **framing).flatten()
    energy = librosa.feature.rms(y=y, **framing).flatten()
    is_breath = np.logical_and(zero_cross > 0.1, energy > 0.1)
    return np.where(is_breath, 1, 0)

def extract_features(y, sr, n_mels=128, n_mfcc=13):
    """Build the stacked MFCC / log-mel / breath feature matrix for a clip.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y`` in Hz.
        n_mels: Number of mel bands for the log-mel spectrogram.
        n_mfcc: Number of MFCC coefficients.

    Returns:
        Array with ``n_mfcc + n_mels + 1`` rows (MFCC rows, then log-mel
        rows, then the breath mask as the last row) and one column per
        retained frame, or ``None`` if extraction failed. On failure the
        error is reported in the Streamlit UI.
    """
    try:
        # Extract MFCC & Log-Mel Spectrogram
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mel_power = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
        logspec = librosa.power_to_db(mel_power)
        breath_feature = extract_breath_features(y, sr)

        # NOTE(review): the MFCC/mel calls use librosa's default hop while
        # extract_breath_features uses the module-level hop_length, so the
        # columns may not be time-aligned -- confirm this matches how the
        # model was trained before changing it.

        # Truncate every feature to the shortest one so they can be stacked.
        min_len = min(mfcc.shape[1], logspec.shape[1], len(breath_feature))
        mfcc = librosa.util.fix_length(mfcc, size=min_len, axis=1)
        logspec = librosa.util.fix_length(logspec, size=min_len, axis=1)
        breath_feature = librosa.util.fix_length(breath_feature, size=min_len)

        return np.vstack((mfcc, logspec, breath_feature))
    except Exception as e:
        # UI boundary: report the actual cause instead of a bare message so
        # the user (and the developer) can see why extraction failed.
        st.error(f"Error processing: {e}")
        return None
def prepare_single_data(features, max_len=500):
    """Shape one feature matrix into a float32 model input.

    The matrix is padded or truncated along axis 1 to ``max_len`` columns,
    then a leading and a trailing singleton axis are added.

    Args:
        features: Feature matrix; axis 1 is the one fixed to ``max_len``.
        max_len: Number of columns the model input must have.

    Returns:
        ``float32`` array with one extra axis at the front and one at the end.
    """
    fixed = librosa.util.fix_length(features, size=max_len, axis=1)
    with_batch = np.expand_dims(fixed, axis=0)
    with_channel = np.expand_dims(with_batch, axis=-1)
    return with_channel.astype(np.float32)

def predict_audio(features):
    """Run the TFLite model on one feature matrix.

    Side effect: stores the elapsed wall time (seconds, covering input
    preparation and inference) in the module-level ``inference_time``.

    Args:
        features: Feature matrix as produced by ``extract_features``.

    Returns:
        Tuple ``(predicted_class, scores)`` where ``predicted_class`` is the
        argmax index of the model output for the single input and ``scores``
        is that output row.
    """
    global inference_time
    # perf_counter is monotonic, so the measured interval cannot go negative.
    started = time.perf_counter()
    prepared_features = prepare_single_data(features)
    interpreter.set_tensor(input_details[0]['index'], prepared_features)
    interpreter.invoke()
    prediction = interpreter.get_tensor(output_details[0]['index'])
    # Elapsed time is end minus start; the reverse order yields a negative value.
    inference_time = time.perf_counter() - started
    return np.argmax(prediction, axis=1)[0], prediction[0]

def plot_waveform(y, sr, start_time, end_time):
    """Plot the selected time range of a signal and return that slice.

    Args:
        y: Audio samples (sliced by sample index).
        sr: Sampling rate of ``y`` in Hz.
        start_time: Start of the range in seconds.
        end_time: End of the range in seconds.

    Returns:
        Tuple ``(y_trimmed, sr)``: the samples inside the range and the
        unchanged sampling rate.
    """
    first = int(start_time * sr)
    last = int(end_time * sr)
    segment = y[first:last]

    frame = pd.DataFrame(
        {
            "Time (s)": np.linspace(start_time, end_time, num=len(segment)),
            "Amplitude": segment,
        }
    )
    figure = px.line(frame, x="Time (s)", y="Amplitude", title="Waveform")
    st.plotly_chart(figure, use_container_width=True)
    return segment, sr

def visualize_features(features, duration):
    """Plot the last row of ``features`` (the breath mask) against time.

    Args:
        features: Stacked feature matrix; its last row is plotted.
        duration: Length in seconds that the columns are spread across.
    """
    n_frames = features.shape[1]
    breath_frame = pd.DataFrame(
        {
            "Time (s)": np.linspace(0, duration, n_frames),
            "Breath Feature": features[-1],
        }
    )
    chart = px.line(
        breath_frame,
        x="Time (s)",
        y="Breath Feature",
        title="Breath Feature Over Time",
    )
    st.plotly_chart(chart, use_container_width=True)

def plot_mfcc_and_logspec(y, sr):
    """Render the MFCC and log-mel spectrogram of a clip in the sidebar.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y`` in Hz.
    """
    # librosa.display is a submodule that a plain `import librosa` does not
    # load in every librosa version, so import what is needed explicitly.
    from librosa.display import specshow

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    # melspectrogram is called with its default settings here, exactly as in
    # extract_features, so use the same power_to_db conversion for the dB plot.
    logspec = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128))

    with st.sidebar:
        st.write("### MFCC and Log-Spectrogram")

        panels = ((mfcc, "MFCC"), (logspec, "Log-Mel Spectrogram"))
        for data, title in panels:
            fig, ax = plt.subplots(figsize=(5, 3))
            # Draw on the explicit axes instead of relying on pyplot's
            # implicit "current figure" state.
            img = specshow(data, sr=sr, x_axis='time', ax=ax)
            fig.colorbar(img, ax=ax)
            ax.set_title(title)
            st.pyplot(fig)
            # Streamlit reruns the script on every interaction; close the
            # figure so matplotlib does not accumulate open figures.
            plt.close(fig)

# ---- Streamlit page: upload -> select range -> extract features -> predict ----
st.title('Audio Integrity Net')
st.subheader('Documentation to be added')
uploaded_file = st.file_uploader('Upload an audio file', type=['wav', 'mp3'])

if uploaded_file:
    # Persist the upload so librosa can open it by path.
    # NOTE(review): MP3 uploads are also stored under a .wav name; this
    # presumably relies on the decoder sniffing the format -- confirm.
    with open('temp_audio.wav', 'wb') as f:
        f.write(uploaded_file.getbuffer())

    y, sr = librosa.load('temp_audio.wav', sr=sample_rate)

    if len(y) == 0:
        # Nothing to slice, plot or classify.
        st.error("The uploaded file contains no audio samples.")
    else:
        duration = librosa.get_duration(y=y, sr=sr)
        start_time, end_time = st.slider("Select time range", 0.0, duration, (0.0, duration))
        y_trimmed, sr = plot_waveform(y, sr, start_time, end_time)

        if len(y_trimmed) == 0:
            # A zero-width selection would make the feature extractors fail
            # on an empty signal; tell the user what to do instead.
            st.warning("Select a non-empty time range to analyse.")
        else:
            if st.sidebar.button("Show MFCC & Log-Spectrogram"):
                plot_mfcc_and_logspec(y_trimmed, sr)

            features = extract_features(y_trimmed, sr)
            if features is not None:
                st.success("Feature Extraction Completed!")
                visualize_features(features, end_time - start_time)
                prediction, probability = predict_audio(features)
                if prediction == 0:
                    st.subheader(' Predicted class is Real ')
                else:
                    st.subheader('Predicted class is Fake')
                # The two-element indexing assumes a two-class model output.
                st.write(f'Probability of being real: {probability[0] * 100:.2f}%')
                st.write(f'Probability of being fake: {probability[1] * 100:.2f}%')
                # abs() keeps the displayed value non-negative regardless of
                # the sign of the stored measurement.
                inference_time = abs(inference_time)
                st.write(f"Inference Time: {inference_time:.6f} seconds")