import os
import numpy as np
from transformers import pipeline
import speech_recognition as sr
import gradio as gr
import cv2
from PIL import Image
import moviepy.editor as mp
from gtts import gTTS
from groq import Groq
import re
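# MindWave pipeline, at a glance:
#   1. Read the recorded video and classify the facial expression of each frame.
#   2. Extract the audio track, estimate the speech emotion, and transcribe the speech.
#   3. Run the transcript through a text-emotion classifier.
#   4. Send the transcript plus emotion cues (and conversation history) to a Groq-hosted
#      LLM acting as a therapist, then speak the reply back with gTTS.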

# Read the Groq API key from the environment rather than hard-coding it in source.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

# Initialize pipelines
image_pipeline = pipeline("image-classification", model="trpakov/vit-face-expression", top_k=1)
audio_pipeline = pipeline("audio-classification", model="audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim")
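# Note: the audeering model is a dimensional (arousal/dominance/valence) model, so the
# pipeline's top label is the dominant dimension rather than a named categorical emotion.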
text_pipeline = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=2)

# Conversation history is kept per session via gr.State in the Gradio interface below.

def process_input(video_stream, conversation_history):
    # Gradio's Video component passes the recording as a file path
    if isinstance(video_stream, str):
        video_file_path = video_stream
    else:
        raise gr.Error("Expected a video file path from the Video component.")

    # Process video frames
    image_features_list = []
    audio_emotion = ""
    text_input = ""
    text_emotions = ""

    cap = cv2.VideoCapture(video_file_path)
    frame_count = 0
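    # Classify the facial expression in every frame; the top label per frame is
    # collected and joined into a single string after the loop.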

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert frame to PIL image
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Analyze the image
        try:
            image_analysis = image_pipeline(pil_image)
            if image_analysis:
                image_features_list.append(image_analysis[0]['label'])
        except Exception as e:
            print(f"Error processing image data: {e}")

        # Increment frame count
        frame_count += 1

    cap.release()

    # Combine image features into a single string
    image_features = ', '.join(image_features_list)
    print("Image features:", image_features)

    # Process audio data and get the emotion label
    try:
        # Extract the audio track as mono 16 kHz WAV, matching the wav2vec2 feature extractor
        video_clip = mp.VideoFileClip(video_file_path)
        audio_file_path = os.path.join("/tmp", "audio.wav")
        video_clip.audio.write_audiofile(audio_file_path, fps=16000, ffmpeg_params=["-ac", "1"])
        video_clip.close()

        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_file_path) as source:
            audio = recognizer.record(source)

        # Convert the raw 16-bit PCM samples to float32 in [-1, 1], as the audio pipeline expects
        audio_data = np.frombuffer(audio.frame_data, dtype=np.int16).astype(np.float32) / 32768.0

        audio_emotions = audio_pipeline(audio_data)
        if audio_emotions:
            audio_emotion = audio_emotions[0]['label']
            print("Audio emotion:", audio_emotion)

            # Transcribe the speech with Google's free speech recognition API
            text_input = recognizer.recognize_google(audio)
            print("User said:", text_input)
    except Exception as e:
        print(f"Error processing audio data: {e}")

    # Process text data and get the emotion labels
    try:
        # Skip text-emotion analysis when nothing was transcribed
        text_analysis = text_pipeline(text_input) if text_input else []
        print("text analysis:", text_analysis)

        if isinstance(text_analysis, list):
            # Flatten the list of lists
            text_analysis = [item for sublist in text_analysis for item in sublist]

            # Initialize an empty list to store the text emotions
            text_emotions_list = []

            # Iterate through each item in the flattened list
            for item in text_analysis:
                # Ensure each item is a dictionary and contains the 'label' key
                if isinstance(item, dict) and 'label' in item:
                    # Append the 'label' value to the text_emotions_list
                    text_emotions_list.append(item['label'])

            # Check if text_emotions_list is empty
            if text_emotions_list:
                # Convert the text_emotions_list to a comma-separated string
                text_emotions = ', '.join(text_emotions_list)
                print("Text emotions:", text_emotions)
            else:
                text_emotions = "No significant emotions detected in the text."

    except Exception as e:
        print(f"Error processing text data: {e}")
    
    
    if conversation_history is not None:
        # conversation_history = conversation_history[-max_history_length:]  # Keep most recent entries
        conversation_history.append({
            "user_input": text_input,
            "image_features": image_features,
            "audio_emotion": audio_emotion,
            "text_emotions": text_emotions
        })
    else:
        conversation_history = [{
            "user_input": text_input,
            "image_features": image_features,
            "audio_emotion": audio_emotion,
            "text_emotions": text_emotions
        }]
    
    prompt = "User said: " + text_input
    if image_features:
        prompt += "\nImage features: " + ', '.join(image_features)
    if audio_emotion:
        prompt += "\nAudio emotion: " + audio_emotion
    if text_emotions:
        prompt += "\nText emotions: " + text_emotions
        

    # Get conversation history text
    history_text = display_history(conversation_history)

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system",
                "content": "As a mental health therapist, you're speaking to a user who is seeking guidance and support. They may be experiencing various challenges and are looking for solutions to improve their mental well-being. Your responses should be empathetic, supportive, and offer practical advice tailored to the user's specific issues. Remember to maintain a positive and non-judgmental tone throughout the interaction."
                },
            {"role": "user",
                # Prepend the conversation history so the model sees earlier turns,
                # followed by the current multimodal prompt.
                "content": history_text + "\n" + prompt
            }
        ],
        model="llama3-70b-8192",
        temperature=0.5,
        max_tokens=1024,
        top_p=1,
        stop=None,
        stream=False,
    )

    ai_response = chat_completion.choices[0].message.content
    conversation_history.append({"ai_response": ai_response})
    print(ai_response)
    
    # Convert the AI response to speech (gTTS writes MP3 data, so use an .mp3 path)
    tts = gTTS(text=ai_response, lang='en')
    audio_file_path = "/tmp/ai_response.mp3"
    tts.save(audio_file_path)
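    # The therapist text, the speech file, and the updated history are returned;
    # the history is bound to a gr.State output in the interface below.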
    
    return ai_response, audio_file_path, conversation_history

def display_history(conversation_history):
    history_str = ""
    for turn in conversation_history:
        if "user_input" in turn:
            history_str += f"User: {turn['user_input']}\n"
        if "ai_response" in turn:
            # Strip Markdown bold markers before displaying/speaking the response
            ai_response = re.sub(r'\*\*', '', turn['ai_response'])
            history_str += f"Therapist: {ai_response}\n\n"
    return history_str


# Create the Gradio interface
input_video = gr.Video(label="Your Video", include_audio=True)
output_text = gr.Textbox(label="Therapist Response")
output_audio = gr.Audio(autoplay=True, visible=False)


custom_css = """
gr.Interface .gradio-title{
    text-align: center;
    font-size: 24px;
    font-weight: bold;
    margin-left:123px;
}

gr.Interface .gradio-description {
    text-align: center;
    font-size: 16px;
    margin-top: 10px;
}
"""

description = """
Speak to the AI through video input and get personalized responses from our mental health therapist. Whether you need guidance, support, or just someone to talk to, our AI is here to help you navigate life's challenges with empathy and understanding.
"""
iface = gr.Interface(
    fn=process_input,
    # A gr.State input/output pair carries the conversation history across turns in a session
    inputs=[input_video, gr.State()],
    outputs=[output_text, output_audio, gr.State()],
    title="MindWave: Real-Time Mental Health Therapist through GenAI and Multimodal Interaction",
    description=description,
    theme=gr.themes.Default(primary_hue="teal", secondary_hue="cyan"),
    allow_flagging="never",
    css=custom_css,
)


iface.launch()