Garvitj committed
Commit c5cc96b · verified · 1 Parent(s): 9b5af99

Update app.py

Files changed (1)
  1. app.py +14 -18
app.py CHANGED
@@ -6,7 +6,6 @@ import json
 import ffmpeg
 import speech_recognition as sr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-import tensorflow as tf
 from tensorflow.keras.preprocessing.text import tokenizer_from_json
 from tensorflow.keras.models import load_model
 from tensorflow.keras.preprocessing.sequence import pad_sequences
@@ -15,17 +14,17 @@ from collections import Counter
 import os
 
 # Load necessary models and files
-text_model = load_model('model_for_text_emotion_updated(1).keras') # Load text emotion model
+text_model = load_model('model_for_text_emotion_updated(1).keras') # Text emotion model
 with open('tokenizer.json') as json_file:
     tokenizer = tokenizer_from_json(json.load(json_file)) # Tokenizer for text emotion
-audio_model = load_model('my_model.h5') # Load audio emotion model
-image_model = load_model('model_emotion.h5') # Load image emotion model
+audio_model = load_model('my_model.h5') # Audio emotion model
+image_model = load_model('model_emotion.h5') # Image emotion model
 
 # Load LLM model from Hugging Face
-llama_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") # Example: small OPT model
+llama_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") # Small OPT model
 llama_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
 
-# Emotion mapping (from your model output)
+# Emotion mapping
 emotion_mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
 
 # Preprocess text for emotion prediction
@@ -45,10 +44,12 @@ def predict_text_emotion(text):
 # Extract audio features and predict emotion
 def extract_audio_features(audio_data, sample_rate):
     if not isinstance(audio_data, np.ndarray):
-        audio_data = np.array(audio_data) # Ensure it's a NumPy array
-
-    mfcc = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate).T, axis=0)
-    return np.expand_dims(mfcc, axis=0)
+        audio_data = np.array(audio_data)
+
+    mfcc = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=704).T, axis=0)
+    features = np.expand_dims(mfcc, axis=0)
+    features = np.reshape(features, (1, 704))
+    return features
 
 def predict_audio_emotion(audio_data, sample_rate):
     features = extract_audio_features(audio_data, sample_rate)
@@ -114,16 +115,11 @@ def transcribe_and_predict_video(video_path):
     image_emotion = process_video(video_path)
 
     # Predict emotion from audio (sound-based)
-    audio_data, sample_rate = librosa.load(audio_file, sr=None)
-
-    # Debugging print statements
-    print(f"Type of audio_data: {type(audio_data)}") # Ensure audio_data is numpy.ndarray
-    print(f"Sample rate: {sample_rate}")
-
+    sample_rate, audio_data = librosa.load(audio_file, sr=None)
     audio_emotion = predict_audio_emotion(audio_data, sample_rate)
 
-    # Combine the detected emotions for final output (you could average them or choose the most common)
-    final_emotion = image_emotion # Or decide based on some logic (e.g., majority vote)
+    # Combine detected emotions for final output (majority voting can be implemented)
+    final_emotion = image_emotion # Using image emotion as primary
 
     # Get response from LLM
     llm_response = interact_with_llm(final_emotion, text)
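
For context on the updated audio path: librosa.load returns the waveform first and the sampling rate second, and the new n_mfcc=704 plus the reshape to (1, 704) are there to hand the Keras model a single 704-dimensional sample. Below is a minimal, standalone sketch of the same pattern; the n_mfcc of 40 and the file path are illustrative only, and note that librosa returns at most n_mels MFCC rows, so an n_mfcc as large as 704 may also require a larger n_mels than the default 128.

import numpy as np
import librosa

def extract_audio_features(audio_data, sample_rate, n_mfcc=40):
    # Average the MFCC matrix over time to get one n_mfcc-dimensional vector
    mfcc = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=n_mfcc).T, axis=0)
    # Reshape to (1, n_mfcc) so a Keras model sees a single sample
    return mfcc.reshape(1, n_mfcc)

# librosa.load returns (waveform, sample_rate), in that order
audio_data, sample_rate = librosa.load("speech.wav", sr=None)  # "speech.wav" is a placeholder path
features = extract_audio_features(audio_data, sample_rate)
print(features.shape)  # (1, 40)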
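
The new comment on final_emotion notes that majority voting can be implemented. Since collections.Counter is already imported in app.py, one possible (uncommitted) way to combine the text, audio, and image predictions:

from collections import Counter

def combine_emotions(text_emotion, audio_emotion, image_emotion):
    # Majority vote across the three modality predictions
    votes = Counter([text_emotion, audio_emotion, image_emotion])
    label, count = votes.most_common(1)[0]
    # If all three disagree, fall back to the image prediction,
    # mirroring the commit's choice of image emotion as primary
    return label if count > 1 else image_emotion

print(combine_emotions("joy", "joy", "neutral"))       # joy
print(combine_emotions("anger", "fear", "sadness"))    # sadness (no majority, image wins)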
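
The changed hunks call interact_with_llm(final_emotion, text) but do not show its body. A hypothetical sketch of such a helper, assuming it simply prompts the facebook/opt-125m model loaded at the top of app.py with the detected emotion and the transcribed text:

from transformers import AutoModelForCausalLM, AutoTokenizer

llama_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
llama_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

def interact_with_llm(emotion, text):
    # Hypothetical helper: condition the small OPT model on the detected emotion
    prompt = f"The user seems to feel {emotion}. They said: '{text}'. Reply with an empathetic response:"
    inputs = llama_tokenizer(prompt, return_tensors="pt")
    outputs = llama_model.generate(inputs["input_ids"], max_new_tokens=60, do_sample=True, top_p=0.9)
    return llama_tokenizer.decode(outputs[0], skip_special_tokens=True)

print(interact_with_llm("sadness", "I failed my exam today"))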