import gradio as gr
import numpy as np
import librosa
import requests
import random
import os
from io import BytesIO
from PIL import Image
from tensorflow.keras.models import load_model
from faster_whisper import WhisperModel

# Load the emotion prediction model
def load_emotion_model(model_path):
    try:
        model = load_model(model_path)
        return model
    except Exception as e:
        print("Error loading emotion prediction model:", e)
        return None

model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
model = load_emotion_model(model_path)

# Initialize WhisperModel
model_size = "small"
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
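# Note: int8 quantization keeps CPU inference light; faster-whisper also supports
# e.g. device="cuda" with compute_type="float16" when a GPU is available.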

# Function to transcribe audio
def transcribe(wav_filepath):
    segments, _ = model2.transcribe(wav_filepath, beam_size=5)
    return "".join(segment.text for segment in segments)

# Function to extract MFCC features from audio
def extract_mfcc(wav_file_name):
    try:
        y, sr = librosa.load(wav_file_name)
        # Average the 40 MFCCs over time so every clip yields a fixed-length
        # 40-dimensional vector, matching the model's (40, 1) input shape
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
        return mfccs
    except Exception as e:
        print("Error extracting MFCC features:", e)
        return None

# Emotions dictionary
emotions = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad', 5: 'angry',
            6: 'fearful', 7: 'disgust', 8: 'surprised'}
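# The numeric codes follow the 8-class RAVDESS label convention (1 = neutral, ..., 8 = surprised).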

# Function to predict emotion from audio
def predict_emotion_from_audio(wav_filepath):
    try:
        test_point = extract_mfcc(wav_filepath)
        if test_point is not None:
            # Reshape to (batch, timesteps, features) as the LSTM expects
            test_point = np.reshape(test_point, newshape=(1, 40, 1))
            predictions = model.predict(test_point)
            # argmax is zero-based, while the emotion labels start at 1
            predicted_emotion_label = np.argmax(predictions[0]) + 1
            return emotions[predicted_emotion_label]
        else:
            return "Error: Unable to extract features"
    except Exception as e:
        print("Error predicting emotion:", e)
        return None
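
# Quick local sanity check (hypothetical; assumes a sample.wav next to this script):
#   print(predict_emotion_from_audio("sample.wav"), transcribe("sample.wav"))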

api_key = os.getenv("DeepAI_api_key")
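# The key is read from the environment; on Hugging Face Spaces it would typically be
# set as a repository secret named "DeepAI_api_key".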

# Function to generate an image using the DeepAI Image Editor API
def generate_image(emotion_prediction, transcribed_text):
    try:
        url = "https://api.deepai.org/api/image-editor"
        headers = {
            'api-key': api_key
        }
        # Select a random base image from TerraIncognita0.jpg to TerraIncognita9.jpg
        image_file_path = f'TAI_Images/TerraIncognita{random.randint(0, 9)}.jpg'
        with open(image_file_path, 'rb') as image_file:
            files = {
                'image': image_file,
                'text': "add " + emotion_prediction + " monsters and asemic writing of " + transcribed_text + " using an old map style"
            }
            response = requests.post(url, headers=headers, files=files)
        response_data = response.json()
        if 'output_url' in response_data:
            # Fetch the generated image so Gradio can display it as a PIL image
            image_response = requests.get(response_data['output_url'])
            return Image.open(BytesIO(image_response.content))
        return None
    except Exception as e:
        print("Error generating image:", e)
        return None

# Function to get predictions
def get_predictions(audio_input):
    emotion_prediction = predict_emotion_from_audio(audio_input)
    transcribed_text = transcribe(audio_input)
    image = generate_image(emotion_prediction, transcribed_text)
    return emotion_prediction, transcribed_text, image

# Create the Gradio interface
interface = gr.Interface(
    fn=get_predictions,
    inputs=gr.Audio(label="Input Audio", type="filepath"),
    outputs=[
        gr.Label(label="Acoustic Prediction"),
        gr.Textbox(label="Transcribed Text"),
        gr.Image(type='pil', label="Generated Image")
    ],
    title="Affective Virtual Environments",
    description="Create an AVE using your voice."
)

interface.launch()
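
# For local testing outside Spaces, interface.launch(share=True) would additionally
# create a temporary public link (an optional Gradio feature, not required here).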