from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
import librosa
import torch

# Load the feature extractor and model
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("r-f/wav2vec-english-speech-emotion-recognition")
model = Wav2Vec2ForSequenceClassification.from_pretrained("r-f/wav2vec-english-speech-emotion-recognition")
model.eval()

def predict_emotion(audio_path):
    # Load audio (mono, 16 kHz)
    audio, rate = librosa.load(audio_path, sr=16000)

    # Extract features
    inputs = feature_extractor(audio, sampling_rate=rate, return_tensors="pt", padding=True)

    # Predict emotion
    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred_id = torch.argmax(probs, dim=-1).item()
    emotion = model.config.id2label[pred_id]
    return emotion

# # Example usage
# emotion = predict_emotion(r"D:\Intern\shankh\audio_samples\anga.wav")
# print(f"Predicted Emotion: {emotion}")
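
# --- Optional sketch (not part of the original script): a variant that returns the
# --- full probability distribution over the model's emotion labels instead of only
# --- the top prediction. It reuses the feature_extractor and model loaded above;
# --- the function name predict_emotion_probs is an assumption for illustration.
def predict_emotion_probs(audio_path):
    # Load audio (mono, 16 kHz) and extract features, as in predict_emotion
    audio, rate = librosa.load(audio_path, sr=16000)
    inputs = feature_extractor(audio, sampling_rate=rate, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.nn.functional.softmax(logits, dim=-1).squeeze(0)
    # Map each label name to its probability so confidence can be inspected
    return {model.config.id2label[i]: probs[i].item() for i in range(probs.shape[-1])}

# # Example usage
# # probs = predict_emotion_probs(r"D:\Intern\shankh\audio_samples\anga.wav")
# # print(probs)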