import streamlit as st
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

# Emoji for each detected emotion
EMOTION_EMOJI = {
    "angry": "😠",
    "happy": "😄",
    "sad": "😢",
    "neutral": "😐"
}

# Load processor and model once and cache them across Streamlit reruns
@st.cache_resource
def load_model():
    processor = Wav2Vec2Processor.from_pretrained(
        "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
    )
    model = Wav2Vec2ForSequenceClassification.from_pretrained(
        "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
    )
    return processor, model

processor, model = load_model()

# Title
st.title("🎙️ Voice Emotion Detector with Emoji")

# Upload audio
uploaded_file = st.file_uploader("Upload a WAV file", type=["wav"])

if uploaded_file is not None:
    st.audio(uploaded_file, format="audio/wav")

    # Load and preprocess audio: downmix to mono and resample to 16 kHz
    speech_array, sampling_rate = torchaudio.load(uploaded_file)
    if speech_array.shape[0] > 1:
        speech_array = speech_array.mean(dim=0, keepdim=True)
    if sampling_rate != 16000:
        speech_array = torchaudio.transforms.Resample(
            orig_freq=sampling_rate, new_freq=16000
        )(speech_array)
    speech = speech_array.squeeze().numpy()

    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)

    # Run inference and map the top logit to its emotion label
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = torch.argmax(logits, dim=-1).item()
    emotion = model.config.id2label[predicted_class_id]

    st.markdown(f"### Emotion Detected: **{emotion}** {EMOTION_EMOJI.get(emotion, '')}")
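
# --- Optional extension (a hedged sketch, not part of the original app) ---
# Shows the full softmax distribution over emotion classes instead of only the
# argmax label. It reuses the `model` loaded above; the helper name and the
# bar-chart presentation are our own choices, not something the model card mandates.
import pandas as pd

def show_emotion_probabilities(logits: torch.Tensor) -> None:
    """Render a bar chart of the per-emotion softmax probabilities."""
    probs = torch.softmax(logits, dim=-1).squeeze().tolist()
    labels = [model.config.id2label[i] for i in range(len(probs))]
    st.bar_chart(pd.DataFrame({"probability": probs}, index=labels))

# To use it, define this helper above the upload handler and call
# show_emotion_probabilities(logits) right after the prediction is computed.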