import numpy as np
import librosa
import sounddevice as sd
from sklearn.ensemble import RandomForestClassifier


def extract_features(audio, sample_rate=16000):
    """Extract 40 MFCCs and average them over time into a fixed-length vector."""
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    return np.mean(mfccs.T, axis=0)


def predict_emotion(audio):
    """Predict an emotion label for a mono audio signal."""
    features = extract_features(audio).reshape(1, -1)  # Reshape to (1, n_features) for the classifier
    return model_rf.predict(features)[0]


# Prepare your emotion classification model. Replace this section with your own
# training procedure; for example purposes we assume a trained 'model_rf'.
emotions = ['happy', 'sad', 'angry', 'fear', 'surprise']  # Example emotion categories

# For demonstration we fit a dummy classifier on random data. Replace this
# placeholder step with actual model training, as demonstrated previously.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
features_dummy = np.random.rand(100, 40)        # Dummy feature data (40 MFCCs per sample)
labels_dummy = np.random.choice(emotions, 100)  # Random dummy labels
model_rf.fit(features_dummy, labels_dummy)      # Dummy fit


def record_and_predict():
    """Record a short clip from the microphone and print the predicted emotion."""
    print("Recording... Please speak with emotion...")
    duration = 5         # Duration of recording in seconds
    sample_rate = 16000  # Sample rate for audio recording, in Hz

    audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate,
                   channels=1, dtype='float32')
    sd.wait()  # Wait until recording is finished
    print("Recording finished.")

    emotion = predict_emotion(audio.flatten())
    print(f'Predicted Emotion: {emotion}')


if __name__ == "__main__":
    record_and_predict()
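
# --- Optional: training on real data (a minimal sketch, not from the original) ---
# The dummy fit above exists only so the demo runs end to end. The helper below
# sketches one way 'model_rf' could instead be fit on labeled recordings. The
# directory layout <data_dir>/<emotion>/*.wav and the name 'train_from_directory'
# are illustrative assumptions, not part of the original script.
import os

def train_from_directory(data_dir):
    """Fit model_rf on WAV files organized as data_dir/<emotion>/*.wav."""
    features, labels = [], []
    for emotion in sorted(os.listdir(data_dir)):
        emotion_dir = os.path.join(data_dir, emotion)
        if not os.path.isdir(emotion_dir):
            continue  # Skip stray files at the top level
        for filename in os.listdir(emotion_dir):
            if filename.endswith('.wav'):
                # Load at 16 kHz mono to match the live-recording pipeline
                audio, _ = librosa.load(os.path.join(emotion_dir, filename),
                                        sr=16000, mono=True)
                features.append(extract_features(audio))
                labels.append(emotion)
    model_rf.fit(np.array(features), labels)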