# Importing the required libraries
import streamlit as st
import numpy as np
import librosa
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from st_audiorec import st_audiorec
# Define the target speakers.
# NOTE: These audio files were used neither to train nor to test the model;
# they were held out so we can see how well the model performs on unseen audio.
#
# Four more held-out files per speaker have not been added here yet,
# but they were already evaluated in the notebook (98% accuracy).
target_dictionary = {
    0: ["p225", 'AUDIO_FILES/p225_358.wav'],  # Label: [speaker_id, wav file]
    1: ["p226", 'AUDIO_FILES/p226_366.wav'],
    2: ["p228", 'AUDIO_FILES/p228_367.wav'],
    3: ["p236", 'AUDIO_FILES/p236_500.wav'],
    4: ["p237", 'AUDIO_FILES/p237_348.wav'],
    5: ["p241", 'AUDIO_FILES/p241_370.wav'],
    6: ["p249", 'AUDIO_FILES/p249_351.wav'],
    7: ["p257", 'AUDIO_FILES/p257_430.wav'],
    8: ["p304", 'AUDIO_FILES/p304_420.wav'],
    9: ["p326", 'AUDIO_FILES/p326_400.wav']
}
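# For reference: classify_speaker() below looks entries up by the predicted
# index, e.g. target_dictionary[0] -> ["p225", 'AUDIO_FILES/p225_358.wav'].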
# Function to extract features from an audio file (same function as in the notebook)
def extract_feature(file_name):
    """ Extract features from audio file
    Args:
        file_name (str): Path to audio file
    Returns:
        np.array: Feature vector
    """
    X, sample_rate = librosa.load(file_name)  # load the audio file (librosa.core.load is deprecated)
    result = np.array([])  # array that stores the features
    # Average the mel spectrogram over time to get a single feature vector
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel))  # append the mel features to the result array
    return result  # return the feature vector
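# Note: librosa.feature.melspectrogram defaults to n_mels=128, so the feature
# vector here has 128 values -- matching the vector_length=128 input that
# create_model() below expects.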
# Gender classification code (NOT MY CODE)
###################################################
# Shout out: https://github.com/JoyBis48
# Link to Hugging Face Space: https://huggingface.co/spaces/Cosmos48/Gender-Voice-Recognition
# Function to convert audio to a spectrogram image, so users can see it too.
def audio_to_spectrogram(file_path):
    # Compute the mel spectrogram and convert power to decibels
    y, sr = librosa.load(file_path)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, hop_length=512)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    # Render it as an image file (axes hidden, just the heatmap)
    plt.figure(figsize=(4, 4))
    plt.axis('off')
    plt.imshow(mel_spec_db, aspect='auto', origin='lower')
    plt.tight_layout()
    plt.savefig("spectrogram.png")
    plt.close()
def classify_gender(file_path):
    # The gender model outputs a single sigmoid value: the probability of "male"
    features = extract_feature(file_path).reshape(1, -1)
    male_prob = gender_model.predict(features, verbose=0)[0][0]
    female_prob = 1 - male_prob
    gender = "male" if male_prob > female_prob else "female"
    probability = "{:.2f}".format(male_prob if gender == "male" else female_prob)
    return gender, probability
# Function to create the gender classification model
def create_model(vector_length=128):
    model = Sequential([
        Dense(256, input_shape=(vector_length,), activation='relu'),
        Dropout(0.3),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
    return model
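# The network is a stack of ReLU layers with dropout that ends in a single
# sigmoid unit, so the output can be read directly as P(male); binary
# cross-entropy is the matching loss for that setup.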
# Load the pre-trained gender model
gender_model = create_model()
# saved_model.h5 contains the pretrained weights for this architecture.
gender_model.load_weights("NEW_MODELS/saved_model.h5")
####################################################
# Function to classify the speaker
def classify_speaker(file_path):
    # Extract features from the user recording
    features = extract_feature(file_path).reshape(1, -1)  # reshape to match the model input
    # Predict the probabilities for each of the 10 speakers
    speaker_probs = model.predict(features, verbose=0)[0]
    # Identify the most likely speaker via the index of the highest probability
    most_likely_speaker = np.argmax(speaker_probs)
    probability = speaker_probs[most_likely_speaker]  # probability of the most likely speaker
    # Map the index to the speaker label
    speaker = f"Speaker {target_dictionary[most_likely_speaker][0]}"
    # Sample clip so users can hear what the predicted voice sounds like
    wav_file = target_dictionary[most_likely_speaker][1]
    # Format the probability for better readability
    probability = "{:.2f}".format(probability)
    return speaker, probability, wav_file
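# Assumption: the speaker model ends in a 10-way output (one unit per entry
# in target_dictionary); its architecture is loaded from
# NEW_MODELS/CUR_speaker_model.h5 rather than defined here.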
# Load the speaker recognition model
model = load_model('NEW_MODELS/CUR_speaker_model.h5')
# Streamlit app
st.title("Voice Correlation Recognition")
st.write("This application is still undergoing fixes & updates ;-;")
# Option to upload a file
uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])
if uploaded_file is not None:
    # Save the upload to disk so the feature extractors can read it from a path
    with open("uploaded_audio.wav", "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.audio(uploaded_file, format='audio/wav')
    if st.button("Submit"):
        try:
            audio_to_spectrogram("uploaded_audio.wav")
            st.image("spectrogram.png", caption="Mel Spectrogram of the uploaded audio file", use_container_width=True)
            speaker, probability, _ = classify_speaker("uploaded_audio.wav")
            gender, gen_probability = classify_gender("uploaded_audio.wav")
            # Predicted gender of the speaker
            st.write(f"Predicted Gender: {gender}")
            # Probability that the speaker is male or female
            st.write(f"Gender Probability: {gen_probability}")
            # Which of the ten target speakers the voice most resembles
            st.write(f"Predicted Speaker: {speaker}")
            # Probability of the predicted speaker
            st.write(f"Speaker Probability: {probability}")
        except Exception as e:
            st.error(f"Error occurred: {e}")
# Record audio with streamlit_audio_recorder
recorded_audio = st_audiorec()
if recorded_audio:
    # Save the recording as a .wav file
    with open("recorded_audio.wav", "wb") as f:
        f.write(recorded_audio)
    st.write("Audio recorded and saved to recorded_audio.wav")  # show a confirmation message
    st.audio("recorded_audio.wav")  # play back the recording
    # Process the recorded audio
    audio_to_spectrogram("recorded_audio.wav")
    st.image("spectrogram.png", caption="Mel Spectrogram of the recorded audio", use_container_width=True)
    # Classify the speaker and gender
    speaker, probability, wav_file = classify_speaker("recorded_audio.wav")
    gender, gen_probability = classify_gender("recorded_audio.wav")
    # Display the results
    st.write(f"Predicted Gender: {gender}")
    st.write(f"Gender Probability: {gen_probability}")
    st.write(f"Predicted Speaker: {speaker}")
    st.write(f"Speaker Probability: {probability}")
    # Play a sample clip of the predicted speaker
    st.audio(wav_file)