import gradio as gr
import torch
import torch.nn.functional as F
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torchaudio
import numpy as np

# Emotion class names. They are paired positionally with the model's output
# probabilities (index i of the logits <-> emotion_labels[i]).
# NOTE(review): the order is assumed to match the checkpoint's label ids —
# confirm against model.config.id2label.
emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]

# Load model and processor
model_name = "Dpngtm/wav2vec2-emotion-recognition"
# `num_labels` is a model-config setting (size of the classification head), so
# it belongs on the model load; the processor only prepares audio and has no
# use for it.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotion_labels),
)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # Inference only: switch off training-time behaviour such as dropout

# Longest clip accepted, in seconds, and the sample rate audio is converted to
# before it is handed to the processor.
_MAX_DURATION_SECONDS = 60
_TARGET_SAMPLE_RATE = 16000


def recognize_emotion(audio):
    """
    Predict emotion confidences for an audio clip.

    Args:
        audio: Path to an audio file, or a file-like object exposing the path
            as ``.name``. ``None`` means no audio was supplied.

    Returns:
        dict mapping each entry of ``emotion_labels`` to its softmax
        probability as a fraction in ``[0, 1]`` (rounded to 4 decimals),
        ordered from most to least likely. When ``audio`` is ``None`` every
        label maps to ``0.0``.

    Raises:
        gr.Error: If the clip is empty, longer than 60 seconds, or cannot be
            loaded/processed. The output component only accepts
            label -> float mappings, so failures are reported through
            Gradio's error mechanism rather than an "Error" entry in the
            result dict.
    """
    if audio is None:
        return {emotion: 0.0 for emotion in emotion_labels}

    try:
        # Handle audio input
        audio_path = audio if isinstance(audio, str) else audio.name

        # Load audio as a (channels, samples) tensor
        speech_array, sampling_rate = torchaudio.load(audio_path)

        if speech_array.numel() == 0:
            raise gr.Error("Audio is empty")

        # Check audio duration
        duration = speech_array.shape[1] / sampling_rate
        if duration > _MAX_DURATION_SECONDS:
            raise gr.Error("Audio too long (max 1 minute)")

        # Resample if needed
        if sampling_rate != _TARGET_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sampling_rate,
                new_freq=_TARGET_SAMPLE_RATE,
            )
            speech_array = resampler(speech_array)

        # Convert to mono if multi-channel
        if speech_array.shape[0] > 1:
            speech_array = torch.mean(speech_array, dim=0, keepdim=True)

        # Peak-normalize. A completely silent clip has a peak of 0; dividing
        # by it would fill the signal with NaNs, so leave it untouched.
        peak = torch.max(torch.abs(speech_array))
        if peak > 0:
            speech_array = speech_array / peak

        # Drop only the channel axis so a one-sample clip stays 1-D.
        speech_array = speech_array.squeeze(0).numpy()

        # Process input
        inputs = processor(
            speech_array,
            sampling_rate=_TARGET_SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )
        input_values = inputs.input_values.to(device)

        # Get predictions
        with torch.no_grad():
            logits = model(input_values).logits

        # Probabilities for the single clip in the batch
        probs = F.softmax(logits, dim=-1)[0].cpu().numpy()
    except gr.Error:
        # Already a user-facing error raised above; pass it through as-is.
        raise
    except Exception as e:
        raise gr.Error(str(e)) from e

    # Keep scores as fractions: the label output renders them as percentages
    # itself, so pre-multiplying by 100 would display values like "9500%".
    confidence_scores = {
        emotion: round(float(prob), 4)
        for emotion, prob in zip(emotion_labels, probs)
    }

    # Most likely emotion first
    return dict(sorted(
        confidence_scores.items(),
        key=lambda item: item[1],
        reverse=True,
    ))

# Create the Gradio interface: one audio input (microphone or upload, passed to
# the handler as a file path) wired to recognize_emotion, with a label output
# that can show every emotion class.
interface = gr.Interface(
    fn=recognize_emotion,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Upload audio or record from microphone",
        max_length=60  # Set max length to 60 seconds in Gradio interface
    ),
    outputs=gr.Label(
        num_top_classes=len(emotion_labels),
        label="Emotion Predictions"
    ),
    title="Speech Emotion Recognition",
    description="""
    ## Speech Emotion Recognition using Wav2Vec2
    
    This model recognizes emotions from speech audio in the following categories:
    - Angry 😠
    - Calm 😌
    - Disgust 🤢
    - Fearful 😨
    - Happy 😊
    - Neutral 😐
    - Sad 😢
    - Surprised 😲
    
    ### Instructions:
    1. Upload an audio file or record through the microphone
    2. Wait for processing
    3. View predicted emotions with confidence scores
    
    ### Notes:
    - Maximum audio length: 1 minute
    - Best results with clear speech and minimal background noise
    - Confidence scores are shown as percentages
    """,
)  # Closes gr.Interface(...); without it the launch call below is swallowed and the file does not parse


# Start the app: listen on every network interface ("0.0.0.0") at port 7860,
# with Gradio's share and debug options both switched on.
interface.launch(server_name="0.0.0.0", server_port=7860, share=True, debug=True)