from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
import librosa
import torch

# Load the feature extractor and model
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("r-f/wav2vec-english-speech-emotion-recognition")
model = Wav2Vec2ForSequenceClassification.from_pretrained("r-f/wav2vec-english-speech-emotion-recognition")
model.eval()

def predict_emotion(audio_path):
    # Load audio (mono, 16 kHz)
    audio, rate = librosa.load(audio_path, sr=16000)

    # Extract features
    inputs = feature_extractor(audio, sampling_rate=rate, return_tensors="pt", padding=True)

    # Predict emotion
    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred_id = torch.argmax(probs, dim=-1).item()
    emotion = model.config.id2label[pred_id]
    return emotion

# # Example usage
# emotion = predict_emotion(r"D:\Intern\shankh\audio_samples\anga.wav")
# print(f"Predicted Emotion: {emotion}")
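
# --- Optional sketch (not part of the original script): a variant that returns the
# --- full probability distribution over the model's emotion labels instead of only
# --- the top prediction. It reuses the feature_extractor and model loaded above;
# --- the function name predict_emotion_probs is an assumption for illustration.
def predict_emotion_probs(audio_path):
    # Load audio (mono, 16 kHz) and extract features, as in predict_emotion
    audio, rate = librosa.load(audio_path, sr=16000)
    inputs = feature_extractor(audio, sampling_rate=rate, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.nn.functional.softmax(logits, dim=-1).squeeze(0)
    # Map each label name to its probability so confidence can be inspected
    return {model.config.id2label[i]: probs[i].item() for i in range(probs.shape[-1])}

# # Example usage
# # probs = predict_emotion_probs(r"D:\Intern\shankh\audio_samples\anga.wav")
# # print(probs)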