import gradio as gr
import numpy as np
import librosa
import cv2
import json
import ffmpeg
import speech_recognition as sr
from transformers import AutoModelForCausalLM, AutoTokenizer
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import img_to_array
from collections import Counter
import os

# Load necessary models and files
text_model = load_model('model_for_text_emotion_updated(1).keras')  # Text emotion model
with open('tokenizer.json') as json_file:
    tokenizer = tokenizer_from_json(json.load(json_file))  # Tokenizer for text emotion
audio_model = load_model('my_model.h5')  # Audio emotion model
image_model = load_model('model_emotion.h5')  # Image emotion model

# Load LLM model from Hugging Face
llama_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # Small OPT model
llama_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# Emotion mapping
emotion_mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}

# Preprocess text for emotion prediction
def preprocess_text(text):
    tokens = [word for word in text.lower().split() if word.isalnum()]
    return ' '.join(tokens)

# Predict emotion from text
def predict_text_emotion(text):
    preprocessed_text = preprocess_text(text)
    seq = tokenizer.texts_to_sequences([preprocessed_text])
    padded_seq = pad_sequences(seq, maxlen=35)
    prediction = text_model.predict(padded_seq)
    emotion_index = np.argmax(prediction)
    return emotion_mapping[emotion_index]

# Extract audio features (mean MFCCs) for the audio emotion model
def extract_audio_features(audio_data, sample_rate):
    audio_data = np.asarray(audio_data, dtype=np.float32)  # Ensure a float32 NumPy array
    mfcc = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40)  # 40 MFCCs to match the (1, 40) model input
    mfcc = np.mean(mfcc.T, axis=0)  # Compute mean across time -> shape (40,)
    features = np.expand_dims(mfcc, axis=0)  # Add batch dimension -> shape (1, 40)
    return features


def predict_audio_emotion(audio_data, sample_rate):
    features = extract_audio_features(audio_data, sample_rate)
    features = np.reshape(features, (1, 40))  # Match model expected input
    prediction = audio_model.predict(features)
    emotion_index = np.argmax(prediction)
    return emotion_mapping[emotion_index]

# Process video and predict emotions from frames
def process_video(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_rate = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = max(int(frame_rate), 1)  # Guard against missing/zero FPS metadata
    predictions = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if int(cap.get(cv2.CAP_PROP_POS_FRAMES)) % frame_interval == 0:  # Sample roughly one frame per second
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            frame = cv2.resize(frame, (48, 48))
            frame = img_to_array(frame) / 255.0
            frame = np.expand_dims(frame, axis=0)
            prediction = image_model.predict(frame)
            predictions.append(np.argmax(prediction))

    cap.release()
    if not predictions:
        return "neutral"  # Fall back when no frames could be read
    most_common_emotion = Counter(predictions).most_common(1)[0][0]
    return emotion_mapping[most_common_emotion]

# Extract audio from video using ffmpeg-python
def extract_audio_from_video(video_path):
    audio_file = 'audio.wav'
    (ffmpeg
        .input(video_path)
        .output(audio_file, format='wav', acodec='pcm_s16le')
        .run(overwrite_output=True))
    return audio_file

def transcribe_audio(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio_record = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio_record)
    except sr.UnknownValueError:
        return ""  # Speech was unintelligible; fall back to an empty transcript

# Query the LLM, adjusting the response based on the detected emotion
def interact_with_llm(emotion, user_input):
    prompt = f"The user is feeling {emotion}. Respond to their question in an empathetic and appropriate manner: {user_input}"

    inputs = llama_tokenizer(prompt, return_tensors="pt")
    outputs = llama_model.generate(**inputs, max_new_tokens=150)
    # Decode only the newly generated tokens so the prompt is not echoed back in the reply
    response = llama_tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

    return response

# Main function to process video and predict emotions
def transcribe_and_predict_video(video_path):
    # Extract audio from video and predict text-based emotion
    audio_file = extract_audio_from_video(video_path)
    text = transcribe_audio(audio_file)
    text_emotion = predict_text_emotion(text)

    # Predict emotion from video frames (image-based)
    image_emotion = process_video(video_path)

    # Predict emotion from audio (sound-based)
    audio_data, sample_rate = librosa.load(audio_file, sr=None)
    audio_emotion = predict_audio_emotion(audio_data, sample_rate)

    # Combine the detected emotions for the final output (a simple majority-vote helper is sketched below)
    final_emotion = image_emotion  # Using the image emotion as primary

    # Get response from LLM
    llm_response = interact_with_llm(final_emotion, text)

    return f"Emotion Detected: {final_emotion}\nLLM Response: {llm_response}"
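
# Optional sketch: a minimal majority-vote combiner across the three modalities, as an
# alternative to relying on the image emotion alone. It assumes all three predictors
# share the same emotion labels; it is not used by transcribe_and_predict_video above.
def combine_emotions_by_vote(text_emotion, audio_emotion, image_emotion):
    votes = Counter([text_emotion, audio_emotion, image_emotion])
    emotion, count = votes.most_common(1)[0]
    # With three distinct predictions there is no majority; prefer the image emotion
    return emotion if count > 1 else image_emotion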

# Create Gradio interface
iface = gr.Interface(fn=transcribe_and_predict_video, 
                     inputs=gr.Video(), 
                     outputs="text", 
                     title="Emotion-Responsive LLM for Video",
                     description="Upload a video to get emotion predictions and LLM responses based on detected emotions.")

iface.launch()