import os
import re

import cv2
import gradio as gr
import moviepy.editor as mp
import speech_recognition as sr
from groq import Groq
from gtts import gTTS
from PIL import Image
from transformers import pipeline

# Read the Groq API key from the environment instead of hard-coding a secret.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Initialize pipelines
image_pipeline = pipeline("image-classification", model="trpakov/vit-face-expression", top_k=1)
audio_pipeline = pipeline("audio-classification", model="audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim")
text_pipeline = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=2)

# max_history_length = 3  # optional cap on the number of stored turns


def process_input(video_stream, conversation_history):
    # Gradio's Video component provides the recording as a file path.
    video_file_path = video_stream if isinstance(video_stream, str) else str(video_stream)

    image_features_list = []
    audio_emotion = ""
    text_input = ""
    text_emotions = ""

    # Process video frames
    cap = cv2.VideoCapture(video_file_path)
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Convert the BGR frame to a PIL image
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        # Analyze the facial expression in the frame
        try:
            image_analysis = image_pipeline(pil_image)
            if image_analysis:
                image_features_list.append(image_analysis[0]['label'])
        except Exception as e:
            print(f"Error processing image data: {e}")
    cap.release()

    # Combine image features into a single string
    image_features = ', '.join(image_features_list)
    print("Image features:", image_features)

    # Process audio data and get the emotion label
    try:
        # Extract audio from the video file
        video_clip = mp.VideoFileClip(video_file_path)
        audio_file_path = os.path.join("/tmp", "audio.wav")
        video_clip.audio.write_audiofile(audio_file_path)
        video_clip.close()

        # Classify the extracted audio; the pipeline loads and resamples the file itself.
        audio_emotions = audio_pipeline(audio_file_path)
        if audio_emotions:
            audio_emotion = audio_emotions[0]['label']
            print("Audio emotion:", audio_emotion)

        # Transcribe the speech
        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_file_path) as source:
            audio = recognizer.record(source)
        text_input = recognizer.recognize_google(audio)
        print("User said:", text_input)
    except Exception as e:
        print(f"Error processing audio data: {e}")

    # Process text data and get the emotion labels
    try:
        text_analysis = text_pipeline(text_input)
        print("Text analysis:", text_analysis)
        if text_analysis and isinstance(text_analysis[0], list):
            # Flatten the nested list some pipeline versions return
            text_analysis = [item for sublist in text_analysis for item in sublist]
        # Collect the predicted emotion labels
        text_emotions_list = [item['label'] for item in text_analysis
                              if isinstance(item, dict) and 'label' in item]
        if text_emotions_list:
            text_emotions = ', '.join(text_emotions_list)
            print("Text emotions:", text_emotions)
        else:
            text_emotions = "No significant emotions detected in the text."
    except Exception as e:
        print(f"Error processing text data: {e}")

    # Record this turn in the conversation history
    turn = {
        "user_input": text_input,
        "image_features": image_features,
        "audio_emotion": audio_emotion,
        "text_emotions": text_emotions,
    }
    if conversation_history is not None:
        # conversation_history = conversation_history[-max_history_length:]  # keep only the most recent turns
        conversation_history.append(turn)
    else:
        conversation_history = [turn]

    # Build the prompt for the language model
    prompt = "User said: " + text_input
    if image_features:
        prompt += "\nImage features: " + image_features
    if audio_emotion:
        prompt += "\nAudio emotion: " + audio_emotion
    if text_emotions:
        prompt += "\nText emotions: " + text_emotions

    # Get conversation history text
    history_text = display_history(conversation_history)

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": (
                    "As a mental health therapist, you're speaking to a user who is seeking "
                    "guidance and support. They may be experiencing various challenges and are "
                    "looking for solutions to improve their mental well-being. Your responses "
                    "should be empathetic, supportive, and offer practical advice tailored to "
                    "the user's specific issues. Remember to maintain a positive and "
                    "non-judgmental tone throughout the interaction."
                ),
            },
            {"role": "user", "content": prompt + history_text},
            {"role": "assistant", "content": history_text},
        ],
        model="llama3-70b-8192",
        temperature=0.5,
        max_tokens=1024,
        top_p=1,
        stop=None,
        stream=False,
    )
    ai_response = chat_completion.choices[0].message.content
    conversation_history.append({"ai_response": ai_response})
    print(ai_response)

    # Convert the AI response to speech (gTTS writes MP3 data)
    tts = gTTS(text=ai_response, lang='en')
    audio_response_path = "/tmp/ai_response.mp3"
    tts.save(audio_response_path)

    return ai_response, audio_response_path, conversation_history


def display_history(conversation_history):
    history_str = ""
    for turn in conversation_history:
        if "user_input" in turn:
            history_str += f"User: {turn['user_input']}\n"
        if "ai_response" in turn:
            # Strip Markdown bold markers before displaying
            ai_response = re.sub(r'\*\*', '', turn['ai_response'])
            history_str += f"Therapist: {ai_response}\n\n"
    return history_str


# Create the Gradio interface
input_video = gr.Video(label="Your Video", include_audio=True)
history_state = gr.State([])  # conversation history carried across turns
output_text = gr.Textbox(label="Therapist Response")
output_audio = gr.Audio(autoplay=True, visible=False)

custom_css = """
.gradio-title {
    text-align: center;
    font-size: 24px;
    font-weight: bold;
    margin-left: 123px;
}
.gradio-description {
    text-align: center;
    font-size: 16px;
    margin-top: 10px;
}
"""

description = """
Speak to the AI through video input and get personalized responses from our mental health therapist.
Whether you need guidance, support, or just someone to talk to, our AI is here to help you
navigate life's challenges with empathy and understanding.
"""

iface = gr.Interface(
    fn=process_input,
    inputs=[input_video, history_state],
    outputs=[output_text, output_audio, history_state],
    title="MindWave: Real-Time Mental Health Therapist through GenAI and Multimodal Interaction",
    description=description,
    theme=gr.themes.Default(primary_hue="teal", secondary_hue="cyan"),
    allow_flagging="never",
    css=custom_css,
)

iface.launch()
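
# --- Optional local test ---
# A minimal sketch for exercising process_input directly, without the web UI;
# "sample.mp4" is a placeholder path for a short recorded clip, and since
# iface.launch() above blocks, comment it out before running this instead.
#
# if __name__ == "__main__":
#     response, audio_path, history = process_input("sample.mp4", [])
#     print("Therapist:", response)
#     print("Audio response saved to:", audio_path)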