Update app.py
app.py
CHANGED
@@ -1,70 +1,142 @@
import gradio as gr
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torchaudio

-# Define emotion labels
emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]

# Load model and processor
-model_name = "Dpngtm/wav2vec2-emotion-recognition"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name, num_labels=len(emotion_labels))

-# Define device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

-# Preprocessing and inference function
def recognize_emotion(audio):
    """
-    Predicts the emotion from an audio file
-
-    Args:
-        audio (str or file-like object): Path or file-like object for the audio file to predict emotion for.
-
-    Returns:
-        str: Predicted emotion label for the given audio file.
    """
    try:
-
        audio_path = audio if isinstance(audio, str) else audio.name
-        print(f'Received audio file:', audio_path)
-

-        # Load and resample audio
        speech_array, sampling_rate = torchaudio.load(audio_path)
-        print(f'Loaded audio with sampling rate:', sampling_rate)

        if sampling_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
-            speech_array = resampler(speech_array)
-
-

-        # Process input
-        inputs = processor(
        input_values = inputs.input_values.to(device)

-        #
        with torch.no_grad():
-
-
-
-
-
-
    except Exception as e:
-        return

-# Gradio interface
interface = gr.Interface(
    fn=recognize_emotion,
-    inputs=gr.Audio(
-
-
-
-

# Launch the app
-interface.launch(
import gradio as gr
import torch
+import torch.nn.functional as F
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torchaudio
+import numpy as np

+# Define emotion labels
emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]

# Load model and processor
+model_name = "Dpngtm/wav2vec2-emotion-recognition"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name, num_labels=len(emotion_labels))

+# Define device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
+model.eval()  # Set model to evaluation mode

def recognize_emotion(audio):
    """
+    Predicts the emotion and confidence scores from an audio file.
+    Max duration: 60 seconds
    """
    try:
+        if audio is None:
+            return {emotion: 0.0 for emotion in emotion_labels}
+
+        # Handle audio input
        audio_path = audio if isinstance(audio, str) else audio.name

+        # Load and resample audio
        speech_array, sampling_rate = torchaudio.load(audio_path)

+        # Check audio duration
+        duration = speech_array.shape[1] / sampling_rate
+        if duration > 60:  # 60 seconds (1 minute) limit
+            return {
+                "Error": "Audio too long (max 1 minute)",
+                **{emotion: 0.0 for emotion in emotion_labels}
+            }
+
+        # Resample if needed
        if sampling_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
+            speech_array = resampler(speech_array)
+
+        # Convert to mono if stereo
+        if speech_array.shape[0] > 1:
+            speech_array = torch.mean(speech_array, dim=0, keepdim=True)
+
+        # Normalize audio
+        speech_array = speech_array / torch.max(torch.abs(speech_array))
+
+        # Convert to numpy and squeeze
+        speech_array = speech_array.squeeze().numpy()

+        # Process input
+        inputs = processor(
+            speech_array,
+            sampling_rate=16000,
+            return_tensors='pt',
+            padding=True
+        )
        input_values = inputs.input_values.to(device)

+        # Get predictions
        with torch.no_grad():
+            outputs = model(input_values)
+            logits = outputs.logits
+
+        # Get probabilities using softmax
+        probs = F.softmax(logits, dim=-1)[0].cpu().numpy()
+
+        # Get confidence scores for all emotions
+        confidence_scores = {
+            emotion: round(float(prob) * 100, 2)  # Convert to percentage with 2 decimal places
+            for emotion, prob in zip(emotion_labels, probs)
+        }
+
+        # Sort confidence scores by value
+        sorted_scores = dict(sorted(
+            confidence_scores.items(),
+            key=lambda x: x[1],
+            reverse=True
+        ))
+
+        return sorted_scores
+
    except Exception as e:
+        return {
+            "Error": str(e),
+            **{emotion: 0.0 for emotion in emotion_labels}
+        }

+# Create Gradio interface
interface = gr.Interface(
    fn=recognize_emotion,
+    inputs=gr.Audio(
+        sources=["microphone", "upload"],
+        type="filepath",
+        label="Upload audio or record from microphone",
+        max_length=60  # Set max length to 60 seconds in Gradio interface
+    ),
+    outputs=gr.Label(
+        num_top_classes=len(emotion_labels),
+        label="Emotion Predictions"
+    ),
+    title="Speech Emotion Recognition",
+    description="""
+    ## Speech Emotion Recognition using Wav2Vec2
+
+    This model recognizes emotions from speech audio in the following categories:
+    - Angry 😠
+    - Calm 😌
+    - Disgust 🤢
+    - Fearful 😨
+    - Happy 😊
+    - Neutral 😐
+    - Sad 😢
+    - Surprised 😲
+
+    ### Instructions:
+    1. Upload an audio file or record through the microphone
+    2. Wait for processing
+    3. View predicted emotions with confidence scores
+
+    ### Notes:
+    - Maximum audio length: 1 minute
+    - Best results with clear speech and minimal background noise
+    - Confidence scores are shown as percentages
+    """,
+)

# Launch the app
+interface.launch(
+    share=True,
+    debug=True,
+    server_name="0.0.0.0",
+    server_port=7860
+)
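One way to exercise the updated recognize_emotion endpoint after launch is through gradio_client. This is only a sketch, assuming gradio_client >= 1.0 is installed, the server started by interface.launch() is reachable at http://localhost:7860 (the server_port configured above), and sample.wav is a placeholder speech clip on disk.

# Hedged usage sketch: query the running app from Python.
# Assumes gradio_client >= 1.0, server at http://localhost:7860, and a local
# speech clip named sample.wav (placeholder, not part of this commit).
from gradio_client import Client, handle_file

client = Client("http://localhost:7860")
result = client.predict(
    handle_file("sample.wav"),  # maps to the single gr.Audio input of recognize_emotion
    api_name="/predict",        # default endpoint name for a one-function gr.Interface
)
print(result)  # gr.Label payload: top emotion plus per-class confidence scores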