import os
import numpy as np
from transformers import pipeline
import speech_recognition as sr
import gradio as gr
import cv2
from PIL import Image
import moviepy.editor as mp
from gtts import gTTS
from groq import Groq
import re
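# MindWave pipeline, at a glance:
#   1. Read the recorded video and classify the facial expression of each frame.
#   2. Extract the audio track, estimate the speech emotion, and transcribe the speech.
#   3. Run the transcript through a text-emotion classifier.
#   4. Send the transcript plus emotion cues (and conversation history) to a Groq-hosted
#      LLM acting as a therapist, then speak the reply back with gTTS.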

# Read the Groq API key from the environment rather than hard-coding it in source.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

# Initialize pipelines
image_pipeline = pipeline("image-classification", model="trpakov/vit-face-expression", top_k=1)
audio_pipeline = pipeline("audio-classification", model="audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim")
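# Note: the audeering model is a dimensional (arousal/dominance/valence) model, so the
# pipeline's top label is the dominant dimension rather than a named categorical emotion.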
text_pipeline = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=2)

# Conversation history is kept per session via gr.State in the Gradio interface below.

def process_input(video_stream, conversation_history):
    # Gradio's Video component passes the recording as a file path
    if isinstance(video_stream, str):
        video_file_path = video_stream
    else:
        raise gr.Error("Expected a video file path from the Video component.")

    # Process video frames
    image_features_list = []
    audio_emotion = ""
    text_input = ""
    text_emotions = ""

    cap = cv2.VideoCapture(video_file_path)
    frame_count = 0
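    # Classify the facial expression in every frame; the top label per frame is
    # collected and joined into a single string after the loop.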

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert frame to PIL image
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Analyze the image
        try:
            image_analysis = image_pipeline(pil_image)
            if image_analysis:
                image_features_list.append(image_analysis[0]['label'])
        except Exception as e:
            print(f"Error processing image data: {e}")

        # Increment frame count
        frame_count += 1

    cap.release()

    # Combine image features into a single string
    image_features = ', '.join(image_features_list)
    print("Image features:", image_features)

    # Process audio data and get the emotion label
    try:
        # Extract the audio track as mono 16 kHz WAV, matching the wav2vec2 feature extractor
        video_clip = mp.VideoFileClip(video_file_path)
        audio_file_path = os.path.join("/tmp", "audio.wav")
        video_clip.audio.write_audiofile(audio_file_path, fps=16000, ffmpeg_params=["-ac", "1"])
        video_clip.close()

        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_file_path) as source:
            audio = recognizer.record(source)

        # Convert the raw 16-bit PCM samples to float32 in [-1, 1], as the audio pipeline expects
        audio_data = np.frombuffer(audio.frame_data, dtype=np.int16).astype(np.float32) / 32768.0

        audio_emotions = audio_pipeline(audio_data)
        if audio_emotions:
            audio_emotion = audio_emotions[0]['label']
            print("Audio emotion:", audio_emotion)

            # Transcribe the speech with Google's free speech recognition API
            text_input = recognizer.recognize_google(audio)
            print("User said:", text_input)
    except Exception as e:
        print(f"Error processing audio data: {e}")

    # Process text data and get the emotion labels
    try:
        # Skip text-emotion analysis when nothing was transcribed
        text_analysis = text_pipeline(text_input) if text_input else []
        print("text analysis:", text_analysis)

        if isinstance(text_analysis, list):
            # Flatten the list of lists
            text_analysis = [item for sublist in text_analysis for item in sublist]

            # Initialize an empty list to store the text emotions
            text_emotions_list = []

            # Iterate through each item in the flattened list
            for item in text_analysis:
                # Ensure each item is a dictionary and contains the 'label' key
                if isinstance(item, dict) and 'label' in item:
                    # Append the 'label' value to the text_emotions_list
                    text_emotions_list.append(item['label'])

            # Check if text_emotions_list is empty
            if text_emotions_list:
                # Convert the text_emotions_list to a comma-separated string
                text_emotions = ', '.join(text_emotions_list)
                print("Text emotions:", text_emotions)
            else:
                text_emotions = "No significant emotions detected in the text."

    except Exception as e:
        print(f"Error processing text data: {e}")
    
    
    if conversation_history is not None:
        # conversation_history = conversation_history[-max_history_length:]  # Keep most recent entries
        conversation_history.append({
            "user_input": text_input,
            "image_features": image_features,
            "audio_emotion": audio_emotion,
            "text_emotions": text_emotions
        })
    else:
        conversation_history = [{
            "user_input": text_input,
            "image_features": image_features,
            "audio_emotion": audio_emotion,
            "text_emotions": text_emotions
        }]
    
    prompt = "User said: " + text_input
    if image_features:
        prompt += "\nImage features: " + ', '.join(image_features)
    if audio_emotion:
        prompt += "\nAudio emotion: " + audio_emotion
    if text_emotions:
        prompt += "\nText emotions: " + text_emotions
        

    # Get conversation history text
    history_text = display_history(conversation_history)

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system",
                "content": "As a mental health therapist, you're speaking to a user who is seeking guidance and support. They may be experiencing various challenges and are looking for solutions to improve their mental well-being. Your responses should be empathetic, supportive, and offer practical advice tailored to the user's specific issues. Remember to maintain a positive and non-judgmental tone throughout the interaction."
                },
            {"role": "user",
                # Prepend the conversation history so the model sees earlier turns,
                # followed by the current multimodal prompt.
                "content": history_text + "\n" + prompt
            }
        ],
        model="llama3-70b-8192",
        temperature=0.5,
        max_tokens=1024,
        top_p=1,
        stop=None,
        stream=False,
    )

    ai_response = chat_completion.choices[0].message.content
    conversation_history.append({"ai_response": ai_response})
    print(ai_response)
    
    # Convert the AI response to speech (gTTS writes MP3 data, so use an .mp3 path)
    tts = gTTS(text=ai_response, lang='en')
    audio_file_path = "/tmp/ai_response.mp3"
    tts.save(audio_file_path)
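    # The therapist text, the speech file, and the updated history are returned;
    # the history is bound to a gr.State output in the interface below.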
    
    return ai_response, audio_file_path, conversation_history

def display_history(conversation_history):
    history_str = ""
    for turn in conversation_history:
        if "user_input" in turn:
            history_str += f"User: {turn['user_input']}\n"
        if "ai_response" in turn:
            # Strip Markdown bold markers before displaying/speaking the response
            ai_response = re.sub(r'\*\*', '', turn['ai_response'])
            history_str += f"Therapist: {ai_response}\n\n"
    return history_str


# Create the Gradio interface
input_video = gr.Video(label="Your Video", include_audio=True)
output_text = gr.Textbox(label="Therapist Response")
output_audio = gr.Audio(autoplay=True, visible=False)


custom_css = """
gr.Interface .gradio-title{
    text-align: center;
    font-size: 24px;
    font-weight: bold;
    margin-left:123px;
}

gr.Interface .gradio-description {
    text-align: center;
    font-size: 16px;
    margin-top: 10px;
}
"""

description = """
Speak to the AI through video input and get personalized responses from our mental health therapist. Whether you need guidance, support, or just someone to talk to, our AI is here to help you navigate life's challenges with empathy and understanding.
"""
iface = gr.Interface(
    fn=process_input,
    # A gr.State input/output pair carries the conversation history across turns in a session
    inputs=[input_video, gr.State()],
    outputs=[output_text, output_audio, gr.State()],
    title="MindWave: Real-Time Mental Health Therapist through GenAI and Multimodal Interaction",
    description=description,
    theme=gr.themes.Default(primary_hue="teal", secondary_hue="cyan"),
    allow_flagging="never",
    css=custom_css,
)


iface.launch()