jfforero committed on
Commit 7a544e4
1 Parent(s): 0cc05bd

Update app.py

Files changed (1):
  app.py  +68 -65
app.py CHANGED
@@ -10,12 +10,12 @@ from faster_whisper import WhisperModel
 
 # Load the emotion prediction model
 def load_emotion_model(model_path):
- try:
- model = load_model(model_path)
- return model
- except Exception as e:
- print("Error loading emotion prediction model:", e)
- return None
+     try:
+         model = load_model(model_path)
+         return model
+     except Exception as e:
+         print("Error loading emotion prediction model:", e)
+         return None
 
 model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
 model = load_emotion_model(model_path)
@@ -26,84 +26,87 @@ model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
 
 # Function to transcribe audio
 def transcribe(wav_filepath):
- segments, _ = model2.transcribe(wav_filepath, beam_size=5)
- return "".join([segment.text for segment in segments])
+     segments, _ = model2.transcribe(wav_filepath, beam_size=5)
+     return "".join([segment.text for segment in segments])
 
 # Function to extract MFCC features from audio
 def extract_mfcc(wav_file_name):
- try:
- y, sr = librosa.load(wav_file_name)
- mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
- return mfccs
- except Exception as e:
- print("Error extracting MFCC features:", e)
- return None
+     try:
+         y, sr = librosa.load(wav_file_name)
+         mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
+         return mfccs
+     except Exception as e:
+         print("Error extracting MFCC features:", e)
+         return None
 
 # Emotions dictionary
 emotions = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad', 5: 'angry', 6: 'fearful', 7: 'disgust', 8: 'surprised'}
 
 # Function to predict emotion from audio
 def predict_emotion_from_audio(wav_filepath):
- try:
- test_point = extract_mfcc(wav_filepath)
- if test_point is not None:
- test_point = np.reshape(test_point, newshape=(1, 40, 1))
- predictions = model.predict(test_point)
- predicted_emotion_label = np.argmax(predictions[0]) + 1
- return emotions[predicted_emotion_label]
- else:
- return "Error: Unable to extract features"
- except Exception as e:
- print("Error predicting emotion:", e)
- return None
+     try:
+         test_point = extract_mfcc(wav_filepath)
+         if test_point is not None:
+             test_point = np.reshape(test_point, newshape=(1, 40, 1))
+             predictions = model.predict(test_point)
+             predicted_emotion_label = np.argmax(predictions[0]) + 1
+             return emotions[predicted_emotion_label]
+         else:
+             return "Error: Unable to extract features"
+     except Exception as e:
+         print("Error predicting emotion:", e)
+         return None
 
 api_key = os.getenv("DeepAI_api_key")
 
 # Function to generate an image using DeepAI Text to Image API
- def generate_image(emotion_prediction, transcribed_text, image_folder='huggingface'):
- try:
- url = "https://api.deepai.org/api/image-editor"
- headers = {'api-key': api_key}
- files = {'image': open('TAI_Images/TerraIncognita2.jpg', 'rb'), 'text': emotion_prediction + " " + transcribed_text}
- response = requests.post(url, headers=headers, files=files)
- response_data = response.json()
- if 'output_url' in response_data:
- image_url = response_data['output_url']
- image_response = requests.get(image_url)
- image = Image.open(BytesIO(image_response.content))
- # Save image
- image.save(os.path.join(image_folder, 'generated_image.jpg'))
- return response_data['output_url']
- else:
- return None
- except Exception as e:
- print("Error generating image:", e)
- return None
-
- # Function to get predictions
- def get_predictions(audio_input, huggingface_folder='huggingface'):
- temp_audio_path = gradio.inputs.FilesMixin.get_file(audio_input, save_path='huggingface')
- emotion_prediction = predict_emotion_from_audio(temp_audio_path)
- transcribed_text = transcribe(temp_audio_path)
- texto_imagen = emotion_prediction + transcribed_text
- image = generate_image(api_key, texto_imagen, image_folder=huggingface_folder)
- # Save audio
- os.rename(temp_audio_path, os.path.join(huggingface_folder, 'input_audio.wav'))
- return emotion_prediction, transcribed_text, image
+
+ def generate_image(emotion_prediction, transcribed_text):
+     try:
+         url = "https://api.deepai.org/api/image-editor"
+         headers = {
+             'api-key': api_key
+         }
+         files = {
+             'image': open('TAI_Images/TerraIncognita2.jpg', 'rb'),  # Replace with the actual path to your image file
+             'text': "this"
+             # 'text': emotion_prediction + " " + transcribed_text
+         }
+         response = requests.post(url, headers=headers, files=files)
+         response_data = response.json()
+         if 'output_url' in response_data:
+             image_url = response_data['output_url']  # remove
+             image_response = requests.get(image_url)  # remove
+             image = Image.open(BytesIO(image_response.content))  # remove
+             return response_data['output_url']
+         else:
+             return None
+     except Exception as e:
+         print("Error generating image:", e)
+         return None
+
+ # Function to get predictions
+ def get_predictions(audio_input):
+     emotion_prediction = predict_emotion_from_audio(audio_input)
+     transcribed_text = transcribe(audio_input)
+     texto_imagen = emotion_prediction + transcribed_text
+     image = generate_image(api_key, texto_imagen)
+     return emotion_prediction, transcribed_text, image
 
 # Create the Gradio interface
 interface = gr.Interface(
- fn=get_predictions,
- inputs=gr.Audio(label="Input Audio", type="filepath"),
- outputs=[
- gr.Label("Acoustic Prediction", label="Acoustic Prediction"),
- gr.Label("Transcribed Text", label="Transcribed Text"),
- gr.Image(type='pil', label="Generated Image")
- ],
- title="Affective Virtual Environments",
- description="Create an AVE using your voice."
+     fn=get_predictions,
+     inputs=gr.Audio(label="Input Audio", type="filepath"),
+     outputs=[
+         gr.Label("Acoustic Prediction", label="Acoustic Prediction"),
+         gr.Label("Transcribed Text", label="Transcribed Text"),
+         gr.Image(type='pil', label="Generated Image")
+     ],
+     title="Affective Virtual Environments",
+     description="Create an AVE using your voice."
 )
 
+
 interface.launch()
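Note: the hunk context above shows only `from faster_whisper import WhisperModel`; the rest of the app.py header is outside this diff. For readers following along, the functions in this commit rely on roughly the following imports. This is a hypothetical reconstruction, not part of the commit; in particular, the source of `load_model` (here `tensorflow.keras.models`) is an assumption.

# Hypothetical import block for context -- not part of commit 7a544e4.
import os
from io import BytesIO

import gradio as gr
import librosa
import numpy as np
import requests
from PIL import Image
from tensorflow.keras.models import load_model  # assumed; any Keras-compatible load_model would do
from faster_whisper import WhisperModel         # visible in the hunk context above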
 
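A side note on the updated functions: `get_predictions` calls `generate_image(api_key, texto_imagen)`, while the new `generate_image` takes `(emotion_prediction, transcribed_text)` and already reads `api_key` from the module-level variable, and the request body still sends the placeholder prompt "this". Below is a minimal sketch, not part of this commit, of how the two functions could be wired so that the predicted emotion and the transcript actually reach the DeepAI endpoint; passing the prompt via `data` as a regular form field is an assumption that mirrors the commented-out line in the commit.

# Editorial sketch only -- not the committed code.
def generate_image(emotion_prediction, transcribed_text):
    try:
        url = "https://api.deepai.org/api/image-editor"
        headers = {'api-key': api_key}
        prompt = f"{emotion_prediction} {transcribed_text}"
        # Context manager closes the base-image handle after the request.
        with open('TAI_Images/TerraIncognita2.jpg', 'rb') as base_image:
            response = requests.post(
                url,
                headers=headers,
                files={'image': base_image},
                data={'text': prompt},  # assumed form field, mirroring the commented-out line
            )
        response_data = response.json()
        return response_data.get('output_url')
    except Exception as e:
        print("Error generating image:", e)
        return None


def get_predictions(audio_input):
    emotion_prediction = predict_emotion_from_audio(audio_input)
    transcribed_text = transcribe(audio_input)
    # Pass the two text pieces; the API key is read inside generate_image.
    image_url = generate_image(emotion_prediction, transcribed_text)
    return emotion_prediction, transcribed_text, image_url

This keeps the same return shape as the committed get_predictions (emotion label, transcript, image URL), so the Gradio interface definition would not need to change.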