import torch
import torch.nn as nn
import librosa
import numpy as np
from torchvision import models
from scipy.ndimage import zoom
import gradio as gr
from joblib import load

# Assuming you already have the 'ann_model' trained and the 'pca' instance
# from the previous code.
language_mapping = {'malayalam': 0, 'english': 1, 'tamil': 2,
                    'hindi': 3, 'kannada': 4, 'telugu': 5}


class ANNModel(nn.Module):
    def __init__(self):
        super(ANNModel, self).__init__()
        self.fc1 = nn.Linear(300, 128)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 6)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x


# Create an instance of the model and load the trained weights
ann_model = ANNModel()
ann_model.load_state_dict(torch.load('ann_model.pth'))

# Load the fitted PCA instance
pca = load('pca.pkl')

# VGG16 convolutional backbone used as a fixed feature extractor
# (on newer torchvision, use weights=models.VGG16_Weights.DEFAULT instead)
vgg16 = models.vgg16(pretrained=True).features


# Load and preprocess a single audio file into a PCA-reduced feature vector
def preprocess_single_audio_vgg16(audio_file, vgg16_model, pca_instance):
    y, sr = librosa.load(audio_file, sr=None)  # Load audio at its native sample rate
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)  # Compute Mel spectrogram
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)  # Apply log transformation
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / np.std(log_mel_spec)  # Normalize

    # Resize the mel spectrogram to the target shape (128, 128) using zoom
    target_shape = (128, 128)
    resized_mel_spec = zoom(norm_mel_spec,
                            (target_shape[0] / norm_mel_spec.shape[0],
                             target_shape[1] / norm_mel_spec.shape[1]),
                            mode='nearest')

    # Stack the resized spectrogram along a third axis to get the 3 channels VGG16 expects
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)

    # Convert to an NCHW float tensor with a batch dimension
    mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()

    # Extract features with the frozen VGG16 backbone
    vgg16_model.eval()
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)

    # Flatten the feature maps and reduce them with the fitted PCA
    features_np = features.squeeze().detach().numpy()
    features_flattened = features_np.flatten().reshape(1, -1)
    features_pca = pca_instance.transform(features_flattened)

    # Convert back to a PyTorch tensor for the ANN
    return torch.from_numpy(features_pca).float()


def predict_language(audio_file_path):
    # Preprocess the audio file, using VGG16 for feature extraction
    preprocessed_features = preprocess_single_audio_vgg16(audio_file_path, vgg16, pca)

    # Make a prediction with the trained ANN
    ann_model.eval()
    with torch.no_grad():
        output = ann_model(preprocessed_features)
        _, predicted_class = torch.max(output, 1)

    # Map the predicted class index back to its language label
    predicted_label = {v: k for k, v in language_mapping.items()}[predicted_class.item()]
    return predicted_label


# gr.Audio(type="filepath") passes the function a filesystem path,
# which is what librosa.load expects
iface = gr.Interface(fn=predict_language, inputs=gr.Audio(type="filepath"), outputs="text")
iface.launch()
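
# --- Optional quick sanity check (a minimal sketch) ---
# 'sample.wav' below is a hypothetical path used only for illustration;
# point it at a real clip in one of the six mapped languages, and run this
# before iface.launch(), since launch() blocks the script.
# print(predict_language("sample.wav"))
#
# launch() also accepts share=True if you need a temporary public URL:
# iface.launch(share=True)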