import os
import numpy as np
from transformers import pipeline
import speech_recognition as sr
import gradio as gr
import cv2
from PIL import Image
import moviepy.editor as mp
from gtts import gTTS
from groq import Groq
import re
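# Runtime dependencies (assumption, inferred from the imports above): gradio, transformers
# (plus torch for the pipelines), opencv-python, Pillow, moviepy, SpeechRecognition, gTTS, groq, numpy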
# Groq client; read the API key from the environment rather than hardcoding a secret in the source
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
# Initialize pipelines
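# image_pipeline -> top facial-expression label per video frame (ViT face-expression model)
# audio_pipeline -> emotion estimate from the speech waveform; note this model predicts
#                   dimensional arousal/dominance/valence scores rather than categorical emotions
# text_pipeline  -> top-2 GoEmotions labels for the transcribed speech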
image_pipeline = pipeline("image-classification", model="trpakov/vit-face-expression", top_k=1)
audio_pipeline = pipeline("audio-classification", model="audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim")
text_pipeline = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=2)
conversation_history = []
# max_history_length = 3
def process_input(video_stream, conversation_history):
    # Gradio passes the recorded/uploaded video as a file path
    if isinstance(video_stream, str):
        video_file_path = video_stream

    # Process video frames
    image_features_list = []
    audio_emotion = ""
    text_input = ""
    text_emotions = ""

    cap = cv2.VideoCapture(video_file_path)
    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Convert frame to PIL image (OpenCV uses BGR, PIL expects RGB)
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        # Analyze the facial expression in this frame
        try:
            image_analysis = image_pipeline(pil_image)
            if image_analysis:
                image_features_list.append(image_analysis[0]['label'])
        except Exception as e:
            print(f"Error processing image data: {e}")
        frame_count += 1
    cap.release()

    # Combine the per-frame expression labels into a single string
    image_features = ', '.join(image_features_list)
    print("Image features:", image_features)
    # Process audio data and get the emotion label
    try:
        # Extract the audio track from the video file
        video_clip = mp.VideoFileClip(video_file_path)
        audio_file_path = os.path.join("/tmp", "audio.wav")
        video_clip.audio.write_audiofile(audio_file_path)

        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_file_path) as source:
            audio = recognizer.record(source)

        # Convert the raw PCM bytes to a normalised float32 waveform and pass the
        # sampling rate so the pipeline can resample if needed
        audio_data = np.frombuffer(audio.frame_data, dtype=np.int16).astype(np.float32) / 32768.0
        audio_emotions = audio_pipeline({"raw": audio_data, "sampling_rate": audio.sample_rate})
        if audio_emotions:
            audio_emotion = audio_emotions[0]['label']
            print("Audio emotion:", audio_emotion)

        # Transcribe the speech
        text_input = recognizer.recognize_google(audio)
        print("User said:", text_input)
    except Exception as e:
        print(f"Error processing audio data: {e}")
    # Process text data and get the emotion labels
    text_emotions = ""
    try:
        # text_input may be empty if speech recognition failed
        if not text_input:
            text_input = ""
        text_analysis = text_pipeline(text_input)
        print("text analysis:", text_analysis)
        # The pipeline may return a list of dicts or a list of lists; flatten if needed
        if text_analysis and isinstance(text_analysis[0], list):
            text_analysis = [item for sublist in text_analysis for item in sublist]
        # Collect the predicted emotion labels
        text_emotions_list = []
        for item in text_analysis:
            if isinstance(item, dict) and 'label' in item:
                text_emotions_list.append(item['label'])
        if text_emotions_list:
            text_emotions = ', '.join(text_emotions_list)
            print("Text emotions:", text_emotions)
        else:
            text_emotions = "No significant emotions detected in the text."
    except Exception as e:
        print(f"Error processing text data: {e}")
    # Record this turn in the conversation history
    if conversation_history is not None:
        # conversation_history = conversation_history[-max_history_length:]  # Keep most recent entries
        conversation_history.append({
            "user_input": text_input,
            "image_features": image_features,
            "audio_emotion": audio_emotion,
            "text_emotions": text_emotions
        })
    else:
        conversation_history = [{
            "user_input": text_input,
            "image_features": image_features,
            "audio_emotion": audio_emotion,
            "text_emotions": text_emotions
        }]

    # Build the prompt from the transcript and the detected emotions
    prompt = "User said: " + text_input
    if image_features:
        prompt += "\nImage features: " + image_features
    if audio_emotion:
        prompt += "\nAudio emotion: " + audio_emotion
    if text_emotions:
        prompt += "\nText emotions: " + text_emotions

    # Get conversation history text
    history_text = display_history(conversation_history)
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system",
             "content": "As a mental health therapist, you're speaking to a user who is seeking guidance and support. They may be experiencing various challenges and are looking for solutions to improve their mental well-being. Your responses should be empathetic, supportive, and offer practical advice tailored to the user's specific issues. Remember to maintain a positive and non-judgmental tone throughout the interaction."
            },
            {"role": "user",
             "content": prompt + history_text
            },
            {"role": "assistant",
             "content": history_text
            }
        ],
        model="llama3-70b-8192",
        temperature=0.5,
        max_tokens=1024,
        top_p=1,
        stop=None,
        stream=False,
    )
    ai_response = chat_completion.choices[0].message.content
    conversation_history.append({"ai_response": ai_response})
    print(ai_response)

    # Convert the AI response to speech (gTTS writes MP3 data)
    tts = gTTS(text=ai_response, lang='en')
    audio_file_path = "/tmp/ai_response.mp3"
    tts.save(audio_file_path)

    return ai_response, audio_file_path, conversation_history
def display_history(conversation_history):
    history_str = ""
    for turn in conversation_history:
        if "user_input" in turn:
            history_str += f"User: {turn['user_input']}\n"
        if "ai_response" in turn:
            # Strip markdown bold markers before displaying
            ai_response = re.sub(r'\*\*', '', turn['ai_response'])
            history_str += f"Therapist: {ai_response}\n\n"
    return history_str
# Create the Gradio interface
input_video = gr.Video(label="Your Video", include_audio=True)
output_text = gr.Textbox(label="Therapist Response")
output_audio = gr.Audio(autoplay=True, visible=False)
# Session state that carries the conversation history between turns
history_state = gr.State([])
custom_css = """
gr.Interface .gradio-title{
text-align: center;
font-size: 24px;
font-weight: bold;
margin-left:123px;
}
gr.Interface .gradio-description {
text-align: center;
font-size: 16px;
margin-top: 10px;
}
"""
description = """
Speak to the AI through video input and get personalized responses from our mental health therapist. Whether you need guidance, support, or just someone to talk to, our AI is here to help you navigate life's challenges with empathy and understanding.
"""
iface = gr.Interface(fn=process_input, inputs=[input_video, history_state], outputs=[output_text, output_audio, history_state], title="MindWave: Real-Time Mental Health Therapist through GenAI and Multimodal Interaction", description=description, theme=gr.themes.Default(primary_hue="teal", secondary_hue="cyan"), allow_flagging="never", css=custom_css)
iface.launch()