# Importing the required libraries
import streamlit as st
import numpy as np
import librosa
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
import io
import soundfile as sf
from st_audiorec import st_audiorec
# Define the target speakers.
# NOTE: these files were used neither to train nor to test the model;
# they were held out so we can see how well the model performs on
# completely unseen audio.
#
# There are 4 more files per speaker that haven't been added here yet,
# but I already tested them in the notebook: 98% accuracy.
target_dictionary = {
    0: ["p225", 'AUDIO_FILES/p225_358.wav'],  # label: [speaker_id, wav file]
    1: ["p226", 'AUDIO_FILES/p226_366.wav'],
    2: ["p228", 'AUDIO_FILES/p228_367.wav'],
    3: ["p236", 'AUDIO_FILES/p236_500.wav'],
    4: ["p237", 'AUDIO_FILES/p237_348.wav'],
    5: ["p241", 'AUDIO_FILES/p241_370.wav'],
    6: ["p249", 'AUDIO_FILES/p249_351.wav'],
    7: ["p257", 'AUDIO_FILES/p257_430.wav'],
    8: ["p304", 'AUDIO_FILES/p304_420.wav'],
    9: ["p326", 'AUDIO_FILES/p326_400.wav']
}
# Function to extract features from an audio file (same function as in the notebook)
def extract_feature(file_name):
    """Extract features from an audio file.

    Args:
        file_name (str): Path to the audio file.

    Returns:
        np.array: Feature vector (mean mel spectrogram).
    """
    X, sample_rate = librosa.load(file_name)  # load the audio file
    result = np.array([])  # array that stores the features
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)  # mean mel spectrogram over time
    result = np.hstack((result, mel))  # append the mel features to the result
    return result  # return the feature vector
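
# A quick sanity check (hypothetical usage, not run by the app): with librosa's
# default n_mels=128, extract_feature returns a 128-dimensional vector, which is
# exactly the input shape both models below expect.
#
#   vec = extract_feature('AUDIO_FILES/p225_358.wav')
#   print(vec.shape)  # (128,)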
# Function to classify gender (NOT MY CODE)
###################################################
# Shout out: https://github.com/JoyBis48
# Link to the Hugging Face Space: https://huggingface.co/spaces/Cosmos48/Gender-Voice-Recognition

# Function to convert an audio file to a spectrogram image, so you can see it too.
def audio_to_spectrogram(file_path):
    y, sr = librosa.load(file_path)  # load the audio file
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, hop_length=512)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)  # convert power to dB for display
    plt.figure(figsize=(4, 4))
    plt.axis('off')
    plt.imshow(mel_spec_db, aspect='auto', origin='lower')
    plt.tight_layout()
    plt.savefig("spectrogram.png")  # save the image for st.image below
    plt.close()
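
# Note: writing to a fixed "spectrogram.png" works for a single-user demo but
# can race if two sessions run at once. A sketch of an in-memory alternative
# (my assumption, not what the app currently does):
#
#   buf = io.BytesIO()
#   plt.savefig(buf, format='png')
#   buf.seek(0)
#   st.image(buf)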
def classify_gender(file_path):
    features = extract_feature(file_path).reshape(1, -1)  # reshape to match the model input
    male_prob = gender_model.predict(features, verbose=0)[0][0]  # sigmoid output = P(male)
    female_prob = 1 - male_prob
    gender = "male" if male_prob > female_prob else "female"
    probability = "{:.2f}".format(male_prob) if gender == "male" else "{:.2f}".format(female_prob)
    return gender, probability
# Function to create the gender classification model
def create_model(vector_length=128):
    model = Sequential([
        Dense(256, input_shape=(vector_length,), activation='relu'),
        Dropout(0.3),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')  # single sigmoid unit: P(male)
    ])
    model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
    return model
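
# load_weights() below restores the checkpoint by architecture, so the layer
# sizes above must match the saved weights exactly. A quick shape check
# (hypothetical, not run by the app):
#
#   create_model().summary()  # expect a (None, 128) input and a (None, 1) output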
# Load the pre-trained gender model.
gender_model = create_model()
# saved_model.h5 holds the pretrained weights that were used.
gender_model.load_weights("NEW_MODELS/saved_model.h5")
####################################################
# Function to classify the speaker
def classify_speaker(file_path):
    # Extract features from the user recording
    features = extract_feature(file_path).reshape(1, -1)  # reshape to match the model input
    # Predict the probabilities for each of the 10 speakers
    speaker_probs = model.predict(features, verbose=0)[0]
    # The most likely speaker is the index with the highest probability
    most_likely_speaker = np.argmax(speaker_probs)
    probability = speaker_probs[most_likely_speaker]  # probability of the most likely speaker
    # Map the index to the speaker label
    speaker = f"Speaker {target_dictionary[most_likely_speaker][0]}"
    # Sample wav file, so users can hear what the predicted speaker sounds like
    # when they record their own voice.
    wav_file = target_dictionary[most_likely_speaker][1]
    # Format the probability for readability
    probability = "{:.2f}".format(probability)
    return speaker, probability, wav_file
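
# Hypothetical usage (the Streamlit handlers below are what actually call this):
#
#   speaker, prob, wav = classify_speaker('AUDIO_FILES/p225_358.wav')
#   # -> ("Speaker p2XX", "0.NN", path to that speaker's sample wav)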
# Load the speaker recognition model
model = load_model('NEW_MODELS/CUR_speaker_model.h5')

# Streamlit app
st.title("Voice Correlation Recognition")
st.write("This application is still undergoing fixes & updates ;-;")
# Option to upload a file
uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])
if uploaded_file is not None:
    with open("uploaded_audio.wav", "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.audio(uploaded_file, format='audio/wav')
    if st.button("Submit"):
        try:
            audio_to_spectrogram("uploaded_audio.wav")
            st.image("spectrogram.png", caption="Mel spectrogram of the uploaded audio file", use_container_width=True)
            speaker, probability, _ = classify_speaker("uploaded_audio.wav")
            gender, gen_probability = classify_gender("uploaded_audio.wav")
            # Predicted gender of the speaker
            st.write(f"Predicted Gender: {gender}")
            # Probability of the predicted gender
            st.write(f"Gender Probability: {gen_probability}")
            # Predicted speaker
            st.write(f"Predicted Speaker: {speaker}")
            # Probability of the predicted speaker
            st.write(f"Speaker Probability: {probability}")
        except Exception as e:
            st.error(f"Error occurred: {e}")
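
# Note: an uploaded .mp3 gets saved under a .wav filename above; librosa decodes
# by content rather than extension, so loading still works. A stricter sketch
# (my assumption, not something the app does) would re-encode to real WAV first:
#
#   data, sr = librosa.load("uploaded_audio.wav", sr=None)
#   sf.write("uploaded_audio.wav", data, sr)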
# Record audio with streamlit_audio_recorder
recorded_audio = st_audiorec()
if recorded_audio:
    # Save the audio as a .wav file
    with open("recorded_audio.wav", "wb") as f:
        f.write(recorded_audio)
    st.write("Audio recorded and saved to recorded_audio.wav")  # show a confirmation message
    st.audio("recorded_audio.wav")  # play back the recording
    # Process the recorded audio
    audio_to_spectrogram("recorded_audio.wav")
    st.image("spectrogram.png", caption="Mel spectrogram of the recorded audio", use_container_width=True)
    # Classify the speaker and gender
    speaker, probability, wav_file = classify_speaker("recorded_audio.wav")
    gender, gen_probability = classify_gender("recorded_audio.wav")
    # Display the results
    st.write(f"Predicted Gender: {gender}")
    st.write(f"Gender Probability: {gen_probability}")
    st.write(f"Predicted Speaker: {speaker}")
    st.write(f"Speaker Probability: {probability}")
    # Play the sample wav file of the predicted speaker
    st.audio(wav_file)