Spaces:

Gregory2041
/

SpeechRecognition

Sleeping

File size: 7,089 Bytes

# Importing the required libraries
import streamlit as st
import numpy as np
import librosa
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
import io 
import soundfile as sf
from st_audiorec import st_audiorec
from scipy.io.wavfile import write, read as wav_read

# Define the target speakers in a list.
# ALSO THESE ARE FILES THAT WERE NOT USED TO TEST NOR TRAIN THE MODEL
# I PULLED THEM OUT JUST SO WE CAN SEE HOW WELL THE MODEL PERFORMS ON 
# UNSEEN AUDIO FILES. 
# 
# There are 4 more files for each speaker that still haven't been added
# here, but I already tested them in the notebook: 98% accuracy.

# One clip per target speaker, listed in label order: the position of an
# entry in this tuple is the class label. Each entry is (speaker_id, wav file).
_SPEAKER_CLIPS = (
    ("p225", 'AUDIO_FILES/p225_358.wav'),
    ("p226", 'AUDIO_FILES/p226_366.wav'),
    ("p228", 'AUDIO_FILES/p228_367.wav'),
    ("p236", 'AUDIO_FILES/p236_500.wav'),
    ("p237", 'AUDIO_FILES/p237_348.wav'),
    ("p241", 'AUDIO_FILES/p241_370.wav'),
    ("p249", 'AUDIO_FILES/p249_351.wav'),
    ("p257", 'AUDIO_FILES/p257_430.wav'),
    ("p304", 'AUDIO_FILES/p304_420.wav'),
    ("p326", 'AUDIO_FILES/p326_400.wav'),
)

# Maps label -> [speaker_id, wav file]
target_dictionary = {
    label: [speaker_id, wav_path]
    for label, (speaker_id, wav_path) in enumerate(_SPEAKER_CLIPS)
}

# Function to extract features from audio file... (same function from notebook)
def extract_feature(file_name):
    """Build the feature vector for one audio file.

    The file is loaded with librosa, a mel spectrogram is computed from the
    samples, and the transposed spectrogram is averaged along axis 0 so that
    one value remains per mel band.

    Args:
      file_name (str): Path to audio file
    return:
      np.array: 1-D feature vector
    """
    samples, rate = librosa.core.load(file_name)
    mel_spectrogram = librosa.feature.melspectrogram(y=samples, sr=rate)
    mel_mean = np.mean(mel_spectrogram.T, axis=0)
    # Stacking onto an empty array leaves the values untouched but yields the
    # same dtype promotion as the original accumulate-into-result approach.
    return np.hstack((np.array([]), mel_mean))

# Function to classify gender (NOT MY CODE)

###################################################
# shout out: https://github.com/JoyBis48
# Link to Hugging Face Space: https://huggingface.co/spaces/Cosmos48/Gender-Voice-Recognition

# Function to convert audio to a spectrogram image, just so you can see it too.
def audio_to_spectrogram(file_path):
    """Render the mel spectrogram of an audio file to "spectrogram.png".

    The audio is loaded with librosa, converted to a 128-band mel spectrogram
    (hop length 512), scaled to decibels relative to its maximum, and drawn
    without axes on a 4x4 inch figure. The image is written to
    "spectrogram.png" relative to the current working directory, overwriting
    any previous file.

    Args:
      file_path (str): Path to audio file
    return:
      None
    """
    y, sr = librosa.load(file_path)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, hop_length=512)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    fig = plt.figure(figsize=(4, 4))
    try:
        plt.axis('off')
        plt.imshow(mel_spec_db, aspect='auto', origin='lower')
        plt.tight_layout()
        plt.savefig("spectrogram.png")
    finally:
        # Close this specific figure even if drawing or saving fails, so a
        # failed request does not leave an open figure behind in pyplot.
        plt.close(fig)
    
def classify_gender(file_path):
    """Predict the speaker's gender for an audio file.

    The single output of ``gender_model`` is read as the probability of
    "male"; its complement is used for "female". A tie is reported as
    "female".

    Args:
      file_path (str): Path to audio file
    return:
      tuple: (gender, probability) where gender is "male" or "female" and
      probability is the winning class probability formatted to two decimals
    """
    # The model expects a batch, so present the feature vector as one row.
    feature_row = extract_feature(file_path).reshape(1, -1)
    male_prob = gender_model.predict(feature_row, verbose=0)[0][0]
    female_prob = 1 - male_prob

    if male_prob > female_prob:
        return "male", "{:.2f}".format(male_prob)
    return "female", "{:.2f}".format(female_prob)

# Function to create the gender classification model
def create_model(vector_length=128):
    """Build and compile the gender classification network.

    The network is a stack of five ReLU Dense layers (256, 256, 128, 128, 64
    units), each followed by Dropout(0.3), and a single sigmoid output unit.

    Args:
      vector_length (int): Length of the input feature vector
    return:
      Sequential: Model compiled with binary cross-entropy, Adam and accuracy
    """
    hidden_units = (256, 256, 128, 128, 64)

    stack = []
    for position, units in enumerate(hidden_units):
        if position == 0:
            # Only the first layer declares the input shape.
            stack.append(Dense(units, input_shape=(vector_length,), activation='relu'))
        else:
            stack.append(Dense(units, activation='relu'))
        stack.append(Dropout(0.3))
    stack.append(Dense(1, activation='sigmoid'))

    network = Sequential(stack)
    network.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
    return network

# Load the pre-trained model
@st.cache_resource(show_spinner=False)
def _load_gender_model():
    """Build the gender network and load its pre-trained weights once.

    Streamlit re-executes this script on every interaction; caching the
    model as a resource avoids rebuilding the network and re-reading the
    weights file on each rerun.
    """
    network = create_model()
    # The saved_model.h5 is the pretrained model that was used.
    network.load_weights("NEW_MODELS/saved_model.h5")
    return network


gender_model = _load_gender_model()

####################################################

# Function to classify the speaker
def classify_speaker(file_path):
    """Predict which target speaker an audio file most resembles.

    Args:
      file_path (str): Path to audio file
    return:
      tuple: (speaker, probability, wav_file) where speaker is the label
      "Speaker <id>", probability is the top class probability formatted to
      two decimals, and wav_file is the sample clip registered for that
      speaker in ``target_dictionary``
    """
    # The model expects a batch, so present the feature vector as one row.
    feature_row = extract_feature(file_path).reshape(1, -1)

    # One score per speaker class; keep the index of the best one.
    scores = model.predict(feature_row, verbose=0)[0]
    best_index = np.argmax(scores)

    # Look up the speaker id and the sample clip for the winning class, so
    # users can hear what the matched voice sounds like.
    entry = target_dictionary[best_index]
    speaker = f"Speaker {entry[0]}"
    wav_file = entry[1]

    return speaker, "{:.2f}".format(scores[best_index]), wav_file

# Load Speaker Reco Model
@st.cache_resource(show_spinner=False)
def _load_speaker_model():
    """Load the speaker recognition model from disk once.

    Streamlit re-executes this script on every interaction; caching the
    model as a resource avoids re-reading the .h5 file on each rerun.
    """
    return load_model('NEW_MODELS/CUR_speaker_model.h5')


model = _load_speaker_model()

# Streamlit app
st.title("Voice Correlation Recognition")
st.write("This application is still undergoing fixes & updates ;-;")

# Option to upload a file
uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])

if uploaded_file is not None:
    # Persist the upload to disk because the feature extraction and
    # spectrogram helpers take a file path.
    with open("uploaded_audio.wav", "wb") as f:
        f.write(uploaded_file.getbuffer())

    # The uploader accepts both wav and mp3, so pass the MIME type reported
    # for this upload instead of always claiming wav; fall back to wav when
    # no type was reported.
    st.audio(uploaded_file, format=uploaded_file.type or 'audio/wav')

    if st.button("Submit"):
        # Top-level boundary of the app: show any failure to the user instead
        # of aborting the page render.
        try:
            audio_to_spectrogram("uploaded_audio.wav")
            st.image("spectrogram.png", caption="Mel Spectrogram of the uploaded audio file", use_container_width=True)
            # The matched speaker's sample clip is not played back for uploads.
            speaker, probability, _ = classify_speaker("uploaded_audio.wav")
            gender, gen_probability = classify_gender("uploaded_audio.wav")

            # What's the gender of the speaker?
            st.write(f"Predicted Gender: {gender}")

            # How likely is the predicted gender?
            st.write(f"Gender Probability: {gen_probability}")

            # Which speaker is it?
            st.write(f"Predicted Speaker: {speaker}")

            # How likely is it to be that speaker?
            st.write(f"Speaker Probability: {probability}")

        except Exception as e:
            st.error(f"Error occurred: {e}")

# Record audio with streamlit_audio_recorder
recorded_audio = st_audiorec()

if recorded_audio:
    # Save the audio as a .wav file because the feature extraction and
    # spectrogram helpers take a file path.
    with open("recorded_audio.wav", "wb") as f:
        f.write(recorded_audio)

    st.write("Audio recorded and saved to recorded_audio.wav")  # Show message
    st.audio("recorded_audio.wav")  # Show the audio file

    # Top-level boundary of the app: show any failure to the user instead of
    # aborting the page render.
    try:
        # Process the recorded audio
        audio_to_spectrogram("recorded_audio.wav")
        st.image("spectrogram.png", caption="Mel Spectrogram of the recorded audio file", use_container_width=True)

        # Classify the speaker and gender
        speaker, probability, wav_file = classify_speaker("recorded_audio.wav")
        gender, gen_probability = classify_gender("recorded_audio.wav")

        # Display results
        st.write(f"Predicted Gender: {gender}")
        st.write(f"Gender Probability: {gen_probability}")
        st.write(f"Predicted Speaker: {speaker}")
        st.write(f"Speaker Probability: {probability}")

        # Play the sample clip of the predicted speaker so the user can
        # compare it with their own voice.
        st.audio(wav_file)
    except Exception as e:
        st.error(f"Error occurred: {e}")