# Importing the required libraries
import streamlit as st
import numpy as np
import librosa
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
import io
import soundfile as sf
from st_audiorec import st_audiorec
from scipy.io.wavfile import write, read as wav_read

# Define the target speakers in a dictionary.
# ALSO, THESE ARE FILES THAT WERE NOT USED TO TRAIN OR TEST THE MODEL.
# I PULLED THEM OUT JUST SO WE CAN SEE HOW WELL THE MODEL PERFORMS ON
# UNSEEN AUDIO FILES.
#
# There are 4 more files for each speaker that still haven't been added
# here, but I already tested them in the notebook: 98% accuracy.
target_dictionary = {
    0: ["p225", 'AUDIO_FILES/p225_358.wav'],  # Label: [Speaker_id, Wav file]
    1: ["p226", 'AUDIO_FILES/p226_366.wav'],
    2: ["p228", 'AUDIO_FILES/p228_367.wav'],
    3: ["p236", 'AUDIO_FILES/p236_500.wav'],
    4: ["p237", 'AUDIO_FILES/p237_348.wav'],
    5: ["p241", 'AUDIO_FILES/p241_370.wav'],
    6: ["p249", 'AUDIO_FILES/p249_351.wav'],
    7: ["p257", 'AUDIO_FILES/p257_430.wav'],
    8: ["p304", 'AUDIO_FILES/p304_420.wav'],
    9: ["p326", 'AUDIO_FILES/p326_400.wav']
}

# Function to extract features from an audio file (same function from the notebook)
def extract_feature(file_name):
    """Extract features from an audio file.

    Args:
        file_name (str): Path to the audio file
    Returns:
        np.array: Feature vector
    """
    X, sample_rate = librosa.core.load(file_name)  # load the audio file
    result = np.array([])  # array that stores the features
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)  # compute the mel spectrogram and average it over time
    result = np.hstack((result, mel))  # append the mel features to the result array
    return result  # return the feature vector

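# A minimal sanity-check sketch (optional; not called anywhere in the app). The default
# path is just one of the sample files from target_dictionary above. librosa's
# melspectrogram defaults to n_mels=128, so the time-averaged vector should be
# 128-dimensional, matching the vector_length used by the models below.
def _check_feature_shape(file_name='AUDIO_FILES/p225_358.wav'):
    vec = extract_feature(file_name)
    print(vec.shape)  # expected: (128,)
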
# Function to classify gender (NOT MY CODE)
###################################################
# shout out: https://github.com/JoyBis48
# Link to Hugging Face Space: https://huggingface.co/spaces/Cosmos48/Gender-Voice-Recognition
# Function to convert audio to a spectrogram image, just so you can see it too.
def audio_to_spectrogram(file_path):
    y, sr = librosa.load(file_path)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, hop_length=512)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    plt.figure(figsize=(4, 4))
    plt.axis('off')
    plt.imshow(mel_spec_db, aspect='auto', origin='lower')
    plt.tight_layout()
    plt.savefig("spectrogram.png")
    plt.close()

def classify_gender(file_path):
    features = extract_feature(file_path).reshape(1, -1)
    male_prob = gender_model.predict(features, verbose=0)[0][0]
    female_prob = 1 - male_prob
    gender = "male" if male_prob > female_prob else "female"
    probability = "{:.2f}".format(male_prob) if gender == "male" else "{:.2f}".format(female_prob)
    return gender, probability

# Function to create the gender classification model
def create_model(vector_length=128):
    model = Sequential([
        Dense(256, input_shape=(vector_length,), activation='relu'),
        Dropout(0.3),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
    return model

# Load the pre-trained gender model
gender_model = create_model()
# saved_model.h5 holds the pretrained weights that were used.
gender_model.load_weights("NEW_MODELS/saved_model.h5")

####################################################

# Function to classify the speaker
def classify_speaker(file_path):
    # Extract features from the user recording
    features = extract_feature(file_path).reshape(1, -1)  # Reshape to match the model input
    # Predict the probabilities for each of the 10 speakers
    speaker_probs = model.predict(features, verbose=0)[0]
    # Identify the most likely speaker by finding the index of the highest probability
    most_likely_speaker = np.argmax(speaker_probs)
    probability = speaker_probs[most_likely_speaker]  # Probability of the most likely speaker
    # Map the index to the speaker label
    speaker = f"Speaker {target_dictionary[most_likely_speaker][0]}"
    # Reference clip so users can hear what the predicted speaker sounds like if they use their own voice.
    wav_file = target_dictionary[most_likely_speaker][1]
    # Format the probability for better readability
    probability = "{:.2f}".format(probability)
    return speaker, probability, wav_file

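# A quick usage sketch (optional; not wired into the app flow). It spot-checks the
# speaker pipeline end to end on one of the held-out sample files from target_dictionary.
# The chosen file is just an example; call this only after the speaker model below has
# been loaded, since classify_speaker relies on the global `model`.
def _spot_check_speaker(file_path='AUDIO_FILES/p226_366.wav'):
    speaker, probability, ref_wav = classify_speaker(file_path)
    print(speaker, probability, ref_wav)
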
# Load the speaker recognition model
model = load_model('NEW_MODELS/CUR_speaker_model.h5')

# Streamlit app
st.title("Voice Correlation Recognition")
st.write("This application is still undergoing fixes & updates ;-;")

# Option to upload a file
uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])
if uploaded_file is not None:
    with open("uploaded_audio.wav", "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.audio(uploaded_file, format='audio/wav')
    if st.button("Submit"):
        try:
            audio_to_spectrogram("uploaded_audio.wav")
            st.image("spectrogram.png", caption="Mel spectrogram of the uploaded audio file", use_container_width=True)
            speaker, probability, _ = classify_speaker("uploaded_audio.wav")
            gender, gen_probability = classify_gender("uploaded_audio.wav")
            # What is the gender of the speaker?
            st.write(f"Predicted Gender: {gender}")
            # How likely is the speaker to be male or female?
            st.write(f"Gender Probability: {gen_probability}")
            # Which speaker is it?
            st.write(f"Predicted Speaker: {speaker}")
            # How likely is it to be that speaker?
            st.write(f"Speaker Probability: {probability}")
        except Exception as e:
            st.error(f"Error occurred: {e}")

# Record audio with streamlit_audio_recorder
recorded_audio = st_audiorec()
if recorded_audio:
    # Save the audio as a .wav file
    with open("recorded_audio.wav", "wb") as f:
        f.write(recorded_audio)
    st.write("Audio recorded and saved to recorded_audio.wav")  # Show a confirmation message
    st.audio("recorded_audio.wav")  # Play back the recorded audio
    # Process the recorded audio
    audio_to_spectrogram("recorded_audio.wav")
    st.image("spectrogram.png", caption="Mel spectrogram of the recorded audio", use_container_width=True)
    # Classify the speaker and gender
    speaker, probability, wav_file = classify_speaker("recorded_audio.wav")
    gender, gen_probability = classify_gender("recorded_audio.wav")
    # Display results
    st.write(f"Predicted Gender: {gender}")
    st.write(f"Gender Probability: {gen_probability}")
    st.write(f"Predicted Speaker: {speaker}")
    st.write(f"Speaker Probability: {probability}")
    # Play the reference wav file of the predicted speaker
    st.audio(wav_file)