import numpy as np
import librosa
import sounddevice as sd
from sklearn.ensemble import RandomForestClassifier


def extract_features(audio, sample_rate=16000):
    """Extract 40 MFCCs and average them over time into a fixed-length vector."""
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    return np.mean(mfccs.T, axis=0)


def predict_emotion(audio):
    """Predict an emotion label for a mono audio signal."""
    features = extract_features(audio).reshape(1, -1)  # Reshape to (1, n_features) for the classifier
    return model_rf.predict(features)[0]


# Prepare your emotion classification model. Replace this section with your own
# training procedure; for example purposes we assume a trained 'model_rf'.
emotions = ['happy', 'sad', 'angry', 'fear', 'surprise']  # Example emotion categories

# For demonstration we fit a dummy classifier on random data. Replace this
# placeholder step with actual model training, as demonstrated previously.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
features_dummy = np.random.rand(100, 40)        # Dummy feature data (40 MFCCs per sample)
labels_dummy = np.random.choice(emotions, 100)  # Random dummy labels
model_rf.fit(features_dummy, labels_dummy)      # Dummy fit


def record_and_predict():
    """Record a short clip from the microphone and print the predicted emotion."""
    print("Recording... Please speak with emotion...")
    duration = 5         # Duration of recording in seconds
    sample_rate = 16000  # Sample rate for audio recording, in Hz

    audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate,
                   channels=1, dtype='float32')
    sd.wait()  # Wait until recording is finished
    print("Recording finished.")

    emotion = predict_emotion(audio.flatten())
    print(f'Predicted Emotion: {emotion}')


if __name__ == "__main__":
    record_and_predict()
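
# --- Optional: training on real data (a minimal sketch, not from the original) ---
# The dummy fit above exists only so the demo runs end to end. The helper below
# sketches one way 'model_rf' could instead be fit on labeled recordings. The
# directory layout <data_dir>/<emotion>/*.wav and the name 'train_from_directory'
# are illustrative assumptions, not part of the original script.
import os

def train_from_directory(data_dir):
    """Fit model_rf on WAV files organized as data_dir/<emotion>/*.wav."""
    features, labels = [], []
    for emotion in sorted(os.listdir(data_dir)):
        emotion_dir = os.path.join(data_dir, emotion)
        if not os.path.isdir(emotion_dir):
            continue  # Skip stray files at the top level
        for filename in os.listdir(emotion_dir):
            if filename.endswith('.wav'):
                # Load at 16 kHz mono to match the live-recording pipeline
                audio, _ = librosa.load(os.path.join(emotion_dir, filename),
                                        sr=16000, mono=True)
                features.append(extract_features(audio))
                labels.append(emotion)
    model_rf.fit(np.array(features), labels)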