Update app.py
app.py CHANGED
@@ -250,18 +250,44 @@ def process_audio_from_video(video_path):
 
 
 
-
-import torch
 import gradio as gr
-
+from huggingface_hub import InferenceClient
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from huggingface_hub import InferenceClient
+
+client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
-tokenizer = AutoTokenizer.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-GPTQ")
-model = AutoModelForCausalLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-GPTQ")
 
+def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
+    messages = [{"role": "system", "content": system_message}]
+
+    # Format history with user and bot messages
+    for val in history:
+        if val[0]:
+            messages.append({"role": "user", "content": val[0]})
+        if val[1]:
+            messages.append({"role": "assistant", "content": val[1]})
+
+    messages.append({"role": "user", "content": message})
 
+    response = ""
+
+    # Stream response from the model
+    for message in client.chat_completion(
+        messages,
+        max_tokens=max_tokens,
+        stream=True,
+        temperature=temperature,
+        top_p=top_p,
+    ):
+        token = message.choices[0].delta.content
+        response += token
+        yield response
+
+
+# Function to handle video processing and interaction
 def transcribe_and_predict_video(video, chat_history=[]):
-    # Process the video for emotions
+    # Process the video for emotions (use your own emotion detection functions)
     image_emotion = process_video(video)
     text_emotion, audio_emotion, user_input = process_audio_from_video(video)
     em = [image_emotion, text_emotion, audio_emotion]
@@ -272,12 +298,12 @@ def transcribe_and_predict_video(video, chat_history=[]):
     # Construct the prompt with emotion context and history
     prompt = f"""
     You are a helpful AI assistant. Respond like a human while considering the user's emotion.
-
+
     User's Emotion: {em}
-
+
     Conversation History:
     {history_text}
-
+
     User ({em}): {user_input}
     Bot:"""
 
@@ -288,7 +314,7 @@ def transcribe_and_predict_video(video, chat_history=[]):
     output = model.generate(**inputs, max_length=512, temperature=0.7, top_p=0.9, do_sample=True)
     response = tokenizer.decode(output[0], skip_special_tokens=True).split("Bot:")[-1].strip()
 
-    # Store the current emotion for the user input (
+    # Store the current emotion for the user input (modify emotion detection as needed)
     emotion = detect_emotion(user_input)  # Assuming `detect_emotion` is a function that returns the user's emotion
 
     # Update the chat history with the current conversation and emotion
@@ -296,13 +322,22 @@ def transcribe_and_predict_video(video, chat_history=[]):
 
     return response, chat_history
 
-# Create Gradio interface
-iface = gr.Interface(fn=transcribe_and_predict_video,
-                     inputs=gr.Video(),
-                     outputs="text",
-                     title="Multimodal Emotion Recognition from Video",
-                     description="Upload a video to get text, audio, and image emotion predictions.")
 
-
+# Gradio interface setup
+demo = gr.ChatInterface(
+    respond,
+    additional_inputs=[
+        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
+        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+    ],
+)
+
+# Launch the Gradio interface
+if __name__ == "__main__":
+    demo.launch()
+
+
 
 
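For reference, a minimal sketch (not part of the commit) of how the new streaming respond generator can be exercised outside gr.ChatInterface. The import path, the sample history, and the parameter values are assumptions, and the call only works if the Hugging Face Inference API for HuggingFaceH4/zephyr-7b-beta is reachable:

# Minimal smoke test for the streaming `respond` generator added in this commit.
# Assumes `respond` and the InferenceClient are defined at module level in app.py,
# as in the diff above; importing app.py does not launch the UI because
# demo.launch() is guarded by `if __name__ == "__main__"`.
from app import respond

history = [("Hi there!", "Hello! How can I help you?")]  # hypothetical prior turns

# `respond` yields the accumulated partial response after each streamed token,
# which is the shape gr.ChatInterface expects when streaming output.
for partial in respond(
    message="Summarise our conversation so far.",
    history=history,
    system_message="You are a friendly Chatbot.",
    max_tokens=128,
    temperature=0.7,
    top_p=0.95,
):
    print(partial)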