import gradio as gr
import numpy as np
import librosa
import requests
from io import BytesIO
from PIL import Image
import os
import random
from tensorflow.keras.models import load_model
from faster_whisper import WhisperModel

# Load the emotion prediction model
def load_emotion_model(model_path):
    try:
        model = load_model(model_path)
        return model
    except Exception as e:
        print("Error loading emotion prediction model:", e)
        return None

model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
model = load_emotion_model(model_path)

# Initialize WhisperModel
model_size = "small"
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
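# Note: "small" keeps CPU transcription reasonably fast; larger faster-whisper checkpoints
# (e.g. "medium") could be swapped in for better accuracy at the cost of speed.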

# Function to transcribe audio
def transcribe(wav_filepath):
    segments, _ = model2.transcribe(wav_filepath, beam_size=5)
    return "".join([segment.text for segment in segments])

# Function to extract MFCC features from audio
def extract_mfcc(wav_file_name):
    try:
        y, sr = librosa.load(wav_file_name)
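        # Average the 40 MFCC coefficients over time so each clip becomes a fixed-length (40,) vector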
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
        return mfccs
    except Exception as e:
        print("Error extracting MFCC features:", e)
        return None

# Emotions dictionary (8-class RAVDESS label convention used to train the model)
emotions = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad', 5: 'angry', 6: 'fearful', 7: 'disgust', 8: 'surprised'}

# Function to predict emotion from audio
def predict_emotion_from_audio(wav_filepath):
    try:
        test_point = extract_mfcc(wav_filepath)
        if test_point is not None:
            test_point = np.reshape(test_point, newshape=(1, 40, 1))
            predictions = model.predict(test_point)
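            # argmax returns a 0-based class index; +1 maps it to the 1-based keys of the emotions dict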
            predicted_emotion_label = np.argmax(predictions[0]) + 1
            return emotions[predicted_emotion_label]
        else:
            return "Error: Unable to extract features"
    except Exception as e:
        print("Error predicting emotion:", e)
        return None

api_key = os.getenv("DeepAI_api_key")
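# The DeepAI_api_key environment variable must be set; otherwise api_key is None and the image request will be rejected.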

# Function to generate an image using the DeepAI Image Editor API
def generate_image(emotion_prediction, transcribed_text):

    try:
        url = "https://api.deepai.org/api/image-editor"
        headers = {
            'api-key': api_key
        }
        # Select a random base image from TerraIncognita0.jpg to TerraIncognita9.jpg
        image_file_path = f'TAI_Images/TerraIncognita{random.randint(0, 9)}.jpg'
        # Open the base image in a context manager so the file handle is always closed
        with open(image_file_path, 'rb') as image_file:
            files = {
                'image': image_file,
                'text': "Generate Patagonian Monsters with a " + emotion_prediction + " attitude, representing the idea of: [" + transcribed_text + "]. Illustrate this using asemic writings in an old map style."
            }
            response = requests.post(url, headers=headers, files=files)
        response_data = response.json()
        if 'output_url' in response_data:
            return response_data['output_url']
        else:
            return None
    except Exception as e:
        print("Error generating image:", e)
        return None

        
# Function to get predictions
def get_predictions(audio_input):
    emotion_prediction = predict_emotion_from_audio(audio_input)
    transcribed_text = transcribe(audio_input)
    image_url = generate_image(emotion_prediction, transcribed_text)
    image = None
    if image_url is not None:
        # Download the generated image and convert it to a PIL image for the gr.Image output
        image_response = requests.get(image_url)
        image = Image.open(BytesIO(image_response.content))
    return emotion_prediction, transcribed_text, image

# Create the Gradio interface
interface = gr.Interface(
    fn=get_predictions,
    inputs=gr.Audio(label="Input Audio", type="filepath", sources=["microphone"]), 
    outputs=[
        gr.Label(label="Acoustic Prediction"),
        gr.Textbox(label="Transcribed Text"),
        gr.Image(type='pil', label="Generated Image")
    ],
    title="Affective Virtual Environments",
    description="Create an AVE using your voice."
)


interface.launch()
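
# If a temporary public link is needed, interface.launch(share=True) is one option.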