Spaces:
Sleeping
Sleeping
File size: 3,755 Bytes
1d6f96a d0263ad 3fdea53 37e33c3 3c19287 b3ece58 d97a50e 1d6f96a b3ece58 670552f ed83206 670552f f2b7d46 d97a50e fc15cbc f2b7d46 e2ed0ec d97a50e ac02da5 ed83206 f2b7d46 b3ece58 e2ed0ec ed83206 e2ed0ec b3ece58 e2ed0ec b3ece58 e2ed0ec ed83206 f77d9ca 26e1011 dc2b70d 26e1011 1f172ae 8752da5 26e1011 dc2b70d 26e1011 dc2b70d 702dba1 26e1011 8752da5 f77d9ca 26e1011 7a544e4 60280de 8752da5 60280de 8752da5 0cc05bd f679e15 d97a50e 8752da5 dc1b3f3 9524e11 26e1011 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import gradio as gr
import numpy as np
import librosa
import requests
from io import BytesIO
from PIL import Image
import os
from tensorflow.keras.models import load_model
from faster_whisper import WhisperModel
# Load the emotion prediction model
def load_emotion_model(model_path):
    """Load the Keras emotion-recognition model stored at ``model_path``.

    Loading is best-effort: any exception raised by ``load_model`` is
    printed and swallowed so that importing this module never fails.

    Args:
        model_path: Path of the saved model file handed to ``load_model``.

    Returns:
        The loaded model, or ``None`` when loading failed.
    """
    try:
        return load_model(model_path)
    except Exception as e:
        print("Error loading emotion prediction model:", e)
        return None
# Emotion model: loaded once at import time. `model` is None when loading
# failed, because load_emotion_model() prints the error instead of raising.
model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
model = load_emotion_model(model_path)
# Initialize WhisperModel
# Speech-to-text model used by transcribe(): the "small" checkpoint, run on
# the CPU with the int8 compute type.
model_size = "small"
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
# Function to transcribe audio
def transcribe(wav_filepath):
    """Transcribe an audio file to text with the module-level Whisper model.

    Args:
        wav_filepath: Path of the audio file passed to ``model2.transcribe``.

    Returns:
        The text of every returned segment, concatenated in order with no
        separator added between segments.
    """
    # The second element of the result is not used here.
    segments, _ = model2.transcribe(wav_filepath, beam_size=5)
    return "".join(segment.text for segment in segments)
# Function to extract MFCC features from audio
def extract_mfcc(wav_file_name):
    """Compute a fixed-length MFCC feature vector for an audio file.

    The file is loaded with ``librosa.load`` (default arguments), 40 MFCCs
    are computed, and the transposed coefficient matrix is averaged along
    axis 0 so that one mean value per coefficient remains.

    Args:
        wav_file_name: Path of the audio file to analyse.

    Returns:
        A NumPy array of the 40 averaged coefficients, or ``None`` when
        loading or feature extraction raised (the error is printed).
    """
    try:
        y, sr = librosa.load(wav_file_name)
        mfcc_matrix = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        return np.mean(mfcc_matrix.T, axis=0)
    except Exception as e:
        print("Error extracting MFCC features:", e)
        return None
# Lookup table from label number (starting at 1) to emotion name
emotions = dict(
    enumerate(
        (
            'neutral',
            'calm',
            'happy',
            'sad',
            'angry',
            'fearful',
            'disgust',
            'surprised',
        ),
        start=1,
    )
)
# Function to predict emotion from audio
def predict_emotion_from_audio(wav_filepath):
    """Predict the emotion expressed in an audio file.

    MFCC features from ``extract_mfcc`` are reshaped to ``(1, 40, 1)`` and
    fed to the module-level ``model``; the highest-scoring class is mapped
    to its name through the ``emotions`` table.

    Args:
        wav_filepath: Path of the audio file to classify.

    Returns:
        The predicted emotion name; the string
        ``"Error: Unable to extract features"`` when feature extraction
        returned ``None``; or ``None`` when anything in the prediction path
        raised (for example ``model`` is ``None`` because loading failed).
        The error is printed in that last case.
    """
    try:
        test_point = extract_mfcc(wav_filepath)
        if test_point is None:
            return "Error: Unable to extract features"
        # Shape is passed positionally: the `newshape` keyword is deprecated
        # in newer NumPy releases, the positional form works on all of them.
        test_point = np.reshape(test_point, (1, 40, 1))
        predictions = model.predict(test_point)
        # argmax is 0-based while the emotions table is keyed from 1.
        predicted_emotion_label = np.argmax(predictions[0]) + 1
        return emotions[predicted_emotion_label]
    except Exception as e:
        print("Error predicting emotion:", e)
        return None
# DeepAI API key, read from the "DeepAI_api_key" environment variable
# (None when the variable is not set). Sent as the 'api-key' request header
# by generate_image().
api_key = os.getenv("DeepAI_api_key")
# Function to generate an image using the DeepAI image-editor API
# `random` is used by generate_image() to pick one of the base map images.
import random
def generate_image(emotion_prediction, transcribed_text, output_resolution=(1024, 1024)):
    """Ask the DeepAI image-editor endpoint to decorate a random base map.

    One of ``TAI_Images/TerraIncognita0.jpg`` .. ``TerraIncognita9.jpg`` is
    chosen at random and uploaded together with a text prompt built from the
    predicted emotion and the transcribed speech.

    Args:
        emotion_prediction: Emotion name inserted into the prompt.
        transcribed_text: Transcribed speech inserted into the prompt.
        output_resolution: Currently unused; kept so existing callers that
            pass it keep working.

    Returns:
        The ``output_url`` value from the JSON response, or ``None`` when
        the response has no such key or any step raised (the error is
        printed).
    """
    try:
        url = "https://api.deepai.org/api/image-editor"
        headers = {
            'api-key': api_key
        }
        # Plain concatenation on purpose: a non-string argument (e.g. None)
        # raises here and is reported below instead of ending up as the
        # literal text "None" in the prompt. Built before the file is opened
        # so a bad argument never leaves a file handle behind.
        prompt = (
            "add " + emotion_prediction
            + " monsters and asemic writing of " + transcribed_text
            + " using an old map style"
        )
        # Select a random image file from TerraIncognita0.jpg to TerraIncognita9.jpg
        image_file_path = f'TAI_Images/TerraIncognita{random.randint(0, 9)}.jpg'
        # Context manager closes the base image once the upload is done.
        with open(image_file_path, 'rb') as image_file:
            files = {
                'image': image_file,
                'text': prompt,
            }
            response = requests.post(url, headers=headers, files=files)
        response_data = response.json()
        return response_data.get('output_url')
    except Exception as e:
        print("Error generating image:", e)
        return None
# Function to get predictions
def get_predictions(audio_input):
    """Run the full pipeline for one audio input.

    Predicts the emotion, transcribes the speech, and requests a generated
    image whose prompt combines both.

    Args:
        audio_input: Path of the audio file to process.

    Returns:
        A tuple ``(emotion_prediction, transcribed_text, image)`` where
        ``image`` is whatever ``generate_image`` returned. When the emotion
        prediction is ``None`` no image is requested and ``image`` is
        ``None``.
    """
    emotion_prediction = predict_emotion_from_audio(audio_input)
    transcribed_text = transcribe(audio_input)
    if emotion_prediction is None:
        # Without an emotion there is nothing to build the prompt from.
        return None, transcribed_text, None
    # generate_image takes the emotion and the text as separate arguments;
    # the API key is read from module scope inside it, not passed in.
    image = generate_image(emotion_prediction, transcribed_text)
    return emotion_prediction, transcribed_text, image
# Create the Gradio interface
# The audio input is handed to get_predictions as a file path; the three
# outputs are filled, in order, from the tuple get_predictions returns.
interface = gr.Interface(
    fn=get_predictions,
    inputs=gr.Audio(label="Input Audio", type="filepath"),
    outputs=[
        gr.Label("Acoustic Prediction", label="Acoustic Prediction"),
        gr.Label("Transcribed Text", label="Transcribed Text"),
        gr.Image(type='pil', label="Generated Image"),
    ],
    title="Affective Virtual Environments",
    description="Create an AVE using your voice.",
)
interface.launch()