import gradio as gr
import numpy as np
import librosa
import cv2
import json
import ffmpeg
import speech_recognition as sr
from transformers import AutoModelForCausalLM, AutoTokenizer
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import img_to_array
from collections import Counter
import os
# Load necessary models and files
text_model = load_model('model_for_text_emotion_updated(1).keras') # Text emotion model
with open('tokenizer.json') as json_file:
    tokenizer = tokenizer_from_json(json.load(json_file))  # Tokenizer for text emotion
audio_model = load_model('my_model.h5') # Audio emotion model
image_model = load_model('model_emotion.h5') # Image emotion model
# Load LLM model from Hugging Face
llama_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") # Small OPT model
llama_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
# Emotion mapping
emotion_mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
# Preprocess text for emotion prediction
def preprocess_text(text):
    tokens = [word for word in text.lower().split() if word.isalnum()]
    return ' '.join(tokens)
# Predict emotion from text
def predict_text_emotion(text):
    preprocessed_text = preprocess_text(text)
    seq = tokenizer.texts_to_sequences([preprocessed_text])
    padded_seq = pad_sequences(seq, maxlen=35)
    prediction = text_model.predict(padded_seq)
    emotion_index = np.argmax(prediction)
    return emotion_mapping[emotion_index]
# Extract audio features and predict emotion
def extract_audio_features(audio_data, sample_rate):
    if not isinstance(audio_data, np.ndarray):
        audio_data = np.array(audio_data, dtype=np.float32)  # Ensure it's a NumPy array with float type
    else:
        audio_data = audio_data.astype(np.float32)  # Convert to float32
    mfcc = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40)  # 40 MFCCs to match the (1, 40) input the audio model expects
    mfcc = np.mean(mfcc.T, axis=0)  # Compute mean across time
    features = np.expand_dims(mfcc, axis=0)  # Add batch dimension
    return features

def predict_audio_emotion(audio_data, sample_rate):
    features = extract_audio_features(audio_data, sample_rate)
    features = np.reshape(features, (1, 40))  # Match model expected input
    prediction = audio_model.predict(features)
    emotion_index = np.argmax(prediction)
    return emotion_mapping[emotion_index]
# Process video and predict emotions from frames
def process_video(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_rate = cap.get(cv2.CAP_PROP_FPS)
    predictions = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        # Sample roughly one frame per second of video
        if int(cap.get(cv2.CAP_PROP_POS_FRAMES)) % int(frame_rate) == 0:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            frame = cv2.resize(frame, (48, 48))
            frame = img_to_array(frame) / 255.0
            frame = np.expand_dims(frame, axis=0)
            prediction = image_model.predict(frame)
            predictions.append(np.argmax(prediction))
    cap.release()
    most_common_emotion = Counter(predictions).most_common(1)[0][0]
    return emotion_mapping[most_common_emotion]
# Extract audio from video using ffmpeg-python
def extract_audio_from_video(video_path):
    audio_file = 'audio.wav'
    (
        ffmpeg
        .input(video_path)
        .output(audio_file, format='wav', acodec='pcm_s16le')
        .run(overwrite_output=True)
    )
    return audio_file

def transcribe_audio(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio_record = recognizer.record(source)
    return recognizer.recognize_google(audio_record)
# Integrating with LLM to adjust responses based on detected emotion
def interact_with_llm(emotion, user_input):
    prompt = f"The user is feeling {emotion}. Respond to their question in an empathetic and appropriate manner: {user_input}"
    inputs = llama_tokenizer(prompt, return_tensors="pt")
    outputs = llama_model.generate(**inputs, max_length=200)
    response = llama_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response
# Main function to process video and predict emotions
def transcribe_and_predict_video(video_path):
    # Extract audio from video and predict text-based emotion
    audio_file = extract_audio_from_video(video_path)
    text = transcribe_audio(audio_file)
    text_emotion = predict_text_emotion(text)

    # Predict emotion from video frames (image-based)
    image_emotion = process_video(video_path)

    # Predict emotion from audio (sound-based)
    audio_data, sample_rate = librosa.load(audio_file, sr=None)
    audio_emotion = predict_audio_emotion(audio_data, sample_rate)

    # Combine detected emotions for the final output (majority voting could be used instead; see the sketch after this function)
    final_emotion = image_emotion  # Using image emotion as primary

    # Get response from LLM
    llm_response = interact_with_llm(final_emotion, text)
    return f"Emotion Detected: {final_emotion}\nLLM Response: {llm_response}"
# Create Gradio interface
iface = gr.Interface(
    fn=transcribe_and_predict_video,
    inputs=gr.Video(),
    outputs="text",
    title="Emotion-Responsive LLM for Video",
    description="Upload a video to get emotion predictions and LLM responses based on detected emotions.",
)
iface.launch()