import gradio as gr
import numpy as np
import librosa
import requests
from io import BytesIO
from PIL import Image
import os
from tensorflow.keras.models import load_model
from faster_whisper import WhisperModel


# Load the emotion prediction model (LSTM trained on RAVDESS)
def load_emotion_model(model_path):
    try:
        model = load_model(model_path)
        return model
    except Exception as e:
        print("Error loading emotion prediction model:", e)
        return None


model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
model = load_emotion_model(model_path)

# Initialize the faster-whisper model for speech-to-text
model_size = "small"
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")


# Transcribe an audio file to text
def transcribe(wav_filepath):
    segments, _ = model2.transcribe(wav_filepath, beam_size=5)
    return "".join(segment.text for segment in segments)


# Extract a 40-dimensional mean MFCC feature vector from an audio file
def extract_mfcc(wav_file_name):
    try:
        y, sr = librosa.load(wav_file_name)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
        return mfccs
    except Exception as e:
        print("Error extracting MFCC features:", e)
        return None


# RAVDESS emotion labels, indexed from 1
emotions = {
    1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
    5: 'angry', 6: 'fearful', 7: 'disgust', 8: 'surprised'
}


# Predict the emotion conveyed in an audio file
def predict_emotion_from_audio(wav_filepath):
    try:
        test_point = extract_mfcc(wav_filepath)
        if test_point is not None:
            # Reshape to (batch, timesteps, features) as expected by the LSTM
            test_point = np.reshape(test_point, newshape=(1, 40, 1))
            predictions = model.predict(test_point)
            # argmax is 0-based; emotion labels start at 1
            predicted_emotion_label = np.argmax(predictions[0]) + 1
            return emotions[predicted_emotion_label]
        else:
            return "Error: Unable to extract features"
    except Exception as e:
        print("Error predicting emotion:", e)
        return None


api_key = os.getenv("DeepAI_api_key")


# Generate an image using the DeepAI Text to Image API
def generate_image(api_key, text):
    url = "https://api.deepai.org/api/text2img"
    headers = {'api-key': api_key}
    response = requests.post(url, data={'text': text}, headers=headers)
    response_data = response.json()
    if 'output_url' in response_data:
        image_url = response_data['output_url']
        image_response = requests.get(image_url)
        image = Image.open(BytesIO(image_response.content))
        return image
    else:
        return None


# Run the full pipeline: emotion prediction, transcription, image generation
def get_predictions(audio_input):
    emotion_prediction = predict_emotion_from_audio(audio_input)
    transcribed_text = transcribe(audio_input)
    # Combine the predicted emotion and the transcript into the image prompt
    texto_imagen = f"{emotion_prediction} {transcribed_text}"
    image = generate_image(api_key, texto_imagen)
    return emotion_prediction, transcribed_text, image


# Create the Gradio interface
interface = gr.Interface(
    fn=get_predictions,
    inputs=gr.Audio(label="Input Audio", type="filepath"),
    outputs=[
        gr.Label(label="Acoustic Prediction"),
        gr.Label(label="Transcribed Text"),
        gr.Image(type='pil', label="Generated Image")
    ],
    title="Affective Virtual Environments",
    description="Create an AVE using your voice."
)

interface.launch()