import torch
import torch.nn as nn
import librosa
import numpy as np
from torchvision import models
from scipy.ndimage import zoom
import gradio as gr
from joblib import load

# Assuming you already have the 'ann_model' trained and the 'pca' instance
# from the previous code.
language_mapping = {'malayalam': 0, 'english': 1, 'tamil': 2,
                    'hindi': 3, 'kannada': 4, 'telugu': 5}


class ANNModel(nn.Module):
    def __init__(self):
        super(ANNModel, self).__init__()
        self.fc1 = nn.Linear(300, 128)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 6)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x


# Create an instance of the model and load the trained weights
ann_model = ANNModel()
ann_model.load_state_dict(torch.load('ann_model.pth'))

# Load the fitted PCA instance
pca = load('pca.pkl')

# VGG16 convolutional backbone used as a fixed feature extractor
# (on newer torchvision, use weights=models.VGG16_Weights.DEFAULT instead)
vgg16 = models.vgg16(pretrained=True).features


# Load and preprocess a single audio file into a PCA-reduced feature vector
def preprocess_single_audio_vgg16(audio_file, vgg16_model, pca_instance):
    y, sr = librosa.load(audio_file, sr=None)  # Load audio at its native sample rate
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)  # Compute Mel spectrogram
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)  # Apply log transformation
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / np.std(log_mel_spec)  # Normalize

    # Resize the mel spectrogram to the target shape (128, 128) using zoom
    target_shape = (128, 128)
    resized_mel_spec = zoom(norm_mel_spec,
                            (target_shape[0] / norm_mel_spec.shape[0],
                             target_shape[1] / norm_mel_spec.shape[1]),
                            mode='nearest')

    # Stack the resized spectrogram along a third axis to get the 3 channels VGG16 expects
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)

    # Convert to an NCHW float tensor with a batch dimension
    mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()

    # Extract features with the frozen VGG16 backbone
    vgg16_model.eval()
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)

    # Flatten the feature maps and reduce them with the fitted PCA
    features_np = features.squeeze().detach().numpy()
    features_flattened = features_np.flatten().reshape(1, -1)
    features_pca = pca_instance.transform(features_flattened)

    # Convert back to a PyTorch tensor for the ANN
    return torch.from_numpy(features_pca).float()


def predict_language(audio_file_path):
    # Preprocess the audio file, using VGG16 for feature extraction
    preprocessed_features = preprocess_single_audio_vgg16(audio_file_path, vgg16, pca)

    # Make a prediction with the trained ANN
    ann_model.eval()
    with torch.no_grad():
        output = ann_model(preprocessed_features)
        _, predicted_class = torch.max(output, 1)

    # Map the predicted class index back to its language label
    predicted_label = {v: k for k, v in language_mapping.items()}[predicted_class.item()]
    return predicted_label


# gr.Audio(type="filepath") passes the function a filesystem path,
# which is what librosa.load expects
iface = gr.Interface(fn=predict_language, inputs=gr.Audio(type="filepath"), outputs="text")
iface.launch()
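
# --- Optional quick sanity check (a minimal sketch) ---
# 'sample.wav' below is a hypothetical path used only for illustration;
# point it at a real clip in one of the six mapped languages, and run this
# before iface.launch(), since launch() blocks the script.
# print(predict_language("sample.wav"))
#
# launch() also accepts share=True if you need a temporary public URL:
# iface.launch(share=True)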