import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import librosa
import time
from datetime import datetime
import pandas as pd

HOME_DIR = ""
local_config_path = 'config.json'
local_preprocessor_config_path = 'preprocessor_config.json'
local_weights_path = 'pytorch_model.bin'
local_training_args_path = 'training_args.bin'

import torch
import torch.nn.functional as F
from tqdm import tqdm

# Define the id2label mapping
id2label = {
    0: "angry",
    1: "disgust",
    2: "fear",
    3: "happy",
    4: "neutral",
    5: "sad",
    6: "surprise"
}


def predict(model, feature_extractor, data, max_length, id2label):
    # Extract features
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ": Extracting features...")
    inputs = feature_extractor(
        data,
        sampling_rate=16000,
        max_length=max_length,
        return_tensors='pt',
        padding=True,
        truncation=True
    )
    torch_inputs = inputs['input_values'].to(torch.float32)

    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ": Predicting...")
    # Forward pass; the custom model below returns logits directly
    with torch.no_grad():
        logits = model(input_values=torch_inputs)

    # Apply softmax to get probabilities
    probabilities = F.softmax(logits, dim=-1)

    # Get the predicted class index and map it to its label
    predicted_class_idx = torch.argmax(probabilities, dim=-1).item()
    predicted_label = id2label[predicted_class_idx]

    return predicted_label, probabilities


from transformers import Wav2Vec2Config, Wav2Vec2Model
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin

config = Wav2Vec2Config.from_pretrained(local_config_path)


class Wav2Vec2ForSpeechClassification(nn.Module, PyTorchModelHubMixin):
    def __init__(self, config):
        super(Wav2Vec2ForSpeechClassification, self).__init__()
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = nn.ModuleDict({
            'dense': nn.Linear(config.hidden_size, config.hidden_size),
            'activation': nn.ReLU(),
            'dropout': nn.Dropout(config.final_dropout),
            'out_proj': nn.Linear(config.hidden_size, config.num_labels)
        })

    def forward(self, input_values):
        outputs = self.wav2vec2(input_values)
        hidden_states = outputs.last_hidden_state
        # Classify from the hidden state of the first time step
        x = self.classifier['dense'](hidden_states[:, 0, :])
        x = self.classifier['activation'](x)
        x = self.classifier['dropout'](x)
        logits = self.classifier['out_proj'](x)
        return logits


import json
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# Load the preprocessor configuration from the local file
with open(local_preprocessor_config_path, 'r') as file:
    preprocessor_config = json.load(file)

# Initialize the feature extractor using the loaded configuration
feature_extractor = Wav2Vec2FeatureExtractor(
    do_normalize=preprocessor_config["do_normalize"],
    feature_extractor_type=preprocessor_config["feature_extractor_type"],
    feature_size=preprocessor_config["feature_size"],
    padding_side=preprocessor_config["padding_side"],
    padding_value=preprocessor_config["padding_value"],
    processor_class=preprocessor_config["processor_class"],
    return_attention_mask=preprocessor_config["return_attention_mask"],
    sampling_rate=preprocessor_config["sampling_rate"]
)

# Download the newly finetuned model weights from the Hugging Face Hub repo
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(
    repo_id="kvilla/wav2vec-english-speech-emotion-recognition-finetuned",
    filename="model_finetuned.pth"
)
# Load the downloaded state dict onto the CPU
saved_model = torch.load(model_path, map_location=torch.device('cpu'))

# Create the model with the loaded configuration
model = Wav2Vec2ForSpeechClassification(config=config)

# Load the state dictionary
model.load_state_dict(saved_model)
print("Model initialized successfully.")
model.eval()


def recognize_emotion(audio):
    # Gradio delivers microphone audio as (sample_rate, numpy array)
    sample_rate, audio_data = audio

    # Ensure audio data is in floating-point format
    if not np.issubdtype(audio_data.dtype, np.floating):
        audio_data = audio_data.astype(np.float32)

    # Resample with librosa if the microphone rate differs from the model's 16 kHz
    if sample_rate != 16000:
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ": Resampling audio...")
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

    # Limit input to 3 seconds (48000 samples at 16 kHz)
    emotion, probabilities = predict(model, feature_extractor, audio_data, 48000, id2label)
    print(probabilities)

    probs = probabilities.detach().numpy().flatten().tolist()
    print(probs)

    # Convert probabilities to percentages
    percentages = [round(prob * 100, 2) for prob in probs]
    print(percentages)

    # Define the class labels (adjust to match your specific model's class labels)
    labels = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprise"]
    print(labels)

    # Create a DataFrame of per-class probabilities, highest first
    df = pd.DataFrame({"Emotion": labels, "Probability (%)": percentages})
    df = df.sort_values(by="Probability (%)", ascending=False)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), df)

    return emotion, get_emotion_image(emotion), df


def get_emotion_image(emotion):
    # Map each emotion to its corresponding image file
    emotion_to_image = {
        "angry": "angry.jpeg",
        "disgust": "disgust.jpeg",
        "fear": "fear.jpeg",
        "happy": "happy.jpeg",
        "neutral": "neutral.jpeg",
        "sad": "sad.jpeg",
        "surprise": "surprise.jpeg"
    }

    # Default image if the emotion is not found
    image_path = emotion_to_image.get(emotion, "default.jpg")

    # Load and return the image
    return Image.open(image_path)


demo = gr.Blocks(theme=gr.themes.Soft())

with demo:
    audio_input = gr.Audio(type="numpy", sources=["microphone"], show_label=True, streaming=True)
    text_output = gr.Textbox(label="Recognized Emotion")
    output_df = gr.DataFrame(label="Emotion Probabilities")
    image_output = gr.Image(label="Emotion Image", scale=1, interactive=False)
    df_logs = gr.DataFrame(label="Output Logs", headers=['Timestamp', 'Emotion'])

    def process_audio(audio, emotion, image, state, df_probs, df_logs):
        current_time = time.time()
        # Only run a new prediction every 10 seconds of streamed audio
        if state is None or (current_time - state >= 10):
            state = current_time
            emotion, image, df_probs = recognize_emotion(audio)

            # Append the new prediction to the output log
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            new_row = {'Timestamp': timestamp, 'Emotion': emotion}
            df_logs = pd.concat([df_logs, pd.DataFrame([new_row])], ignore_index=True)

            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Predicted emotion: ", emotion)
            return emotion, image, state, df_probs, df_logs
        else:
            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Not yet time")
            return emotion, image, state, df_probs, df_logs

    # Automatically call the recognize_emotion function while audio is being recorded
    state = gr.State(None)
    audio_input.stream(
        fn=process_audio,
        inputs=[audio_input, text_output, image_output, state, output_df, df_logs],
        outputs=[text_output, image_output, state, output_df, df_logs]
    )
demo.launch(share=True)
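
# --- Optional offline check (illustrative sketch, not part of the original app) ---
# A hedged example of how recognize_emotion() could be exercised without the Gradio UI,
# using a synthetic 1-second 440 Hz tone in place of microphone input. The tone and its
# amplitude are arbitrary assumptions for illustration only. It is left commented out so
# it does not interfere with demo.launch(); uncomment and run in a separate session if
# you want a quick sanity check of the model pipeline.
#
# sr = 16000
# t = np.linspace(0, 1.0, sr, endpoint=False)
# tone = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
# label, image, probs_df = recognize_emotion((sr, tone))
# print("Offline check prediction:", label)
# print(probs_df)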