video does not really work that well
Files changed:
- __pycache__/clip_transform.cpython-39.pyc  +0 -0
- app.py  +120 -2
- clip_transform.py  +51 -0
- requirements.txt  +4 -1

__pycache__/clip_transform.cpython-39.pyc
ADDED

Binary file (1.9 kB)
app.py
CHANGED

@@ -7,6 +7,7 @@ import numpy as np
 import streamlit as st
 from streamlit_webrtc import WebRtcMode, webrtc_streamer
 import pydub
+import torch
 # import av
 # import cv2
 from sample_utils.turn import get_ice_servers
@@ -23,8 +24,65 @@ system_one = {
     "audio_bit_rate": 16000,
     # "audio_bit_rate": 32000,
     # "audio_bit_rate": 48000,
+
+    # "vision_embeddings_fps": 5,
+    "vision_embeddings_fps": 2,
 }
 
+system_one["video_detection_emotions"] = [
+    "Happiness",
+    "Sadness",
+    "Fear",
+    "Disgust",
+    "Anger",
+    "Surprise",
+    "Boredom",
+    "Interest",
+    "Excitement",
+    "Guilt",
+    "Shame",
+    "Relief",
+    "Love",
+    "Embarrassment",
+    "Pride",
+    "Envy",
+    "Jealousy",
+    "Anxiety",
+    "Hope",
+    "Despair",
+    "Frustration",
+    "Confusion",
+    "Curiosity",
+    "Contentment",
+    "Indifference",
+    "Anticipation",
+    "Gratitude",
+    "Bitterness"
+]
+system_one["video_detection_engement"] = [
+    "Facial_Expressions",
+    "Open_Body_Language",
+    "Closed_Body_Language",
+    "Eye_Contact",
+    "Interest",
+    "Boredom",
+    "Confusion",
+    "Frustration",
+    "Question_Asking",
+    "Engaged_Language",
+    "Short_Responses",
+    "Distraction_Signs"
+]
+system_one["video_detection_present"] = [
+    "a person",
+    "no one",
+    " ",
+    "multiple people",
+    "a group of people",
+]
+
+system_one_audio_status = st.empty()
+
 
 playing = st.checkbox("Playing", value=True)
 
@@ -94,6 +152,22 @@ async def queued_audio_frames_callback(
 
     return new_frames
 
+system_one_audio_status.write("Initializing CLIP model")
+from clip_transform import CLIPTransform
+clip_transform = CLIPTransform()
+
+system_one_audio_status.write("Initializing CLIP templates")
+
+embeddings = clip_transform.text_to_embeddings(system_one["video_detection_emotions"])
+system_one["video_detection_emotions_embeddings"] = embeddings
+
+embeddings = clip_transform.text_to_embeddings(system_one["video_detection_engement"])
+system_one["video_detection_engement_embeddings"] = embeddings
+
+embeddings = clip_transform.text_to_embeddings(system_one["video_detection_present"])
+system_one["video_detection_present_embeddings"] = embeddings
+
+system_one_audio_status.write("Initializing webrtc_streamer")
 webrtc_ctx = webrtc_streamer(
     key="charles",
     desired_playing_state=playing,
@@ -105,18 +179,31 @@ webrtc_ctx = webrtc_streamer(
     async_processing=True,
 )
 
-system_one_audio_status = st.empty()
 
 if not webrtc_ctx.state.playing:
     exit
 
-system_one_audio_status.write("Initializing
+system_one_audio_status.write("Initializing streaming")
 system_one_audio_output = st.empty()
+
+system_one_video_output = st.empty()
+
 system_one_audio_history = []
 system_one_audio_history_output = st.empty()
 
 
 sound_chunk = pydub.AudioSegment.empty()
+current_video_embedding = None
+current_video_embedding_timestamp = time.monotonic()
+
+
+def get_dot_similarities(video_embedding, embeddings, embeddings_labels):
+    dot_product = torch.mm(embeddings, video_embedding.T)
+    similarity_image_label = [(float("{:.4f}".format(dot_product[i][0])), embeddings_labels[i]) for i in range(len(embeddings_labels))]
+    similarity_image_label.sort(reverse=True)
+    return similarity_image_label
+
+
 while True:
     if webrtc_ctx.state.playing:
         # handle video
@@ -125,6 +212,37 @@ while True:
         while len(video_frames_deque) > 0:
             frame = video_frames_deque.popleft()
             video_frames.append(frame)
+        get_embeddings = False
+        get_embeddings |= current_video_embedding is None
+        current_time = time.monotonic()
+        elapsed_time = current_time - current_video_embedding_timestamp
+        get_embeddings |= elapsed_time > 1. / system_one['vision_embeddings_fps']
+        if get_embeddings and len(video_frames) > 0:
+            current_video_embedding_timestamp = current_time
+            current_video_embedding = clip_transform.image_to_embeddings(video_frames[-1].to_ndarray())
+
+            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
+            emotions_top_3 = ""
+            for i in range(3):
+                emotions_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
+            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
+            engagement_top_3 = ""
+            for i in range(3):
+                engagement_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
+            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
+            present_top_3 = ""
+            for i in range(3):
+                present_top_3 += f"'{similarities[i][1]}' ({similarities[i][0]}), "
+
+            # table_content = "**System 1 Video:**\n\n"
+            table_content = "| System 1 Video | |\n| --- | --- |\n"
+            table_content += f"| Present | {present_top_3} |\n"
+            table_content += f"| Emotion | {emotions_top_3} |\n"
+            table_content += f"| Engagement | {engagement_top_3} |\n"
+            system_one_video_output.markdown(table_content)
+            # system_one_video_output.markdown(f"**System 1 Video:** \n [Emotion: {emotions_top_3}], \n [Engagement: {engagement_top_3}], \n [Present: {present_top_3}] ")
+            # for similarity, image_label in similarity_image_label:
+            #     print (f"{similarity} {image_label}")
 
         # handle audio
         audio_frames = []
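For reference, the new get_dot_similarities helper scores one frame embedding against a set of label embeddings. Because CLIPTransform (added below) L2-normalizes both text and image embeddings, the matrix product is simply the cosine similarity between the frame and each label, and sorting the (score, label) tuples puts the best matches first. A minimal, self-contained sketch of that computation, not part of the commit, using random tensors as stand-ins for real CLIP outputs (the 1024 dimension is only illustrative):

import torch

# Stand-ins for CLIP outputs: 5 label embeddings and 1 frame embedding,
# both L2-normalized along the feature dimension (as CLIPTransform does).
labels = ["Happiness", "Sadness", "Fear", "Disgust", "Anger"]
text_embeddings = torch.nn.functional.normalize(torch.randn(5, 1024), dim=-1)
image_embedding = torch.nn.functional.normalize(torch.randn(1, 1024), dim=-1)

# (5, 1024) @ (1024, 1) -> (5, 1): one cosine similarity per label.
dot_product = torch.mm(text_embeddings, image_embedding.T)

# Pair each score with its label and sort best-first, as the app does.
scored = sorted(
    ((float(dot_product[i][0]), labels[i]) for i in range(len(labels))),
    reverse=True,
)
print(scored[:3])  # top-3 (score, label) pairs

In the app itself, the loop only recomputes the frame embedding once at least 1 / vision_embeddings_fps seconds have elapsed since the previous one, capping the CLIP workload at roughly two embeddings per second with the committed setting of 2.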
clip_transform.py
ADDED

@@ -0,0 +1,51 @@
+import json
+import os
+import numpy as np
+import torch
+from PIL import Image
+from clip_retrieval.load_clip import load_clip, get_tokenizer
+# from clip_retrieval.clip_client import ClipClient, Modality
+
+class CLIPTransform:
+    def __init__(self):
+        # os.environ["OMP_NUM_THREADS"] = "20"
+        # torch.set_num_threads(20)
+        # Load model
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        # if self.device == "cpu" and torch.backends.mps.is_available():
+        #     self.device = torch.device("mps")
+        # self._clip_model="ViT-L/14"
+        self._clip_model="open_clip:ViT-H-14"
+        # self._clip_model="open_clip:ViT-L-14"
+        # self._clip_model="open_clip:datacomp_xl_s13b_b90k"
+        # import open_clip
+        # pretrained = dict(open_clip.list_pretrained())
+        # checkpoint = pretrained[self._clip_model]
+        self.model, self.preprocess = load_clip(self._clip_model, use_jit=True, device=self.device)
+        self.tokenizer = get_tokenizer(self._clip_model)
+
+        print ("using device", self.device)
+
+    def text_to_embeddings(self, prompts):
+        # if prompt is a string, convert to list
+        if type(prompts) is str:
+            prompts = [prompts]
+        text = self.tokenizer(prompts).to(self.device)
+        with torch.no_grad():
+            prompt_embededdings = self.model.encode_text(text)
+            prompt_embededdings /= prompt_embededdings.norm(dim=-1, keepdim=True)
+        return(prompt_embededdings)
+
+    def image_to_embeddings(self, input_im):
+        input_im = Image.fromarray(input_im)
+        prepro = self.preprocess(input_im).unsqueeze(0).to(self.device)
+        with torch.no_grad():
+            image_embeddings = self.model.encode_image(prepro)
+            image_embeddings /= image_embeddings.norm(dim=-1, keepdim=True)
+        return(image_embeddings)
+
+    def preprocessed_image_to_emdeddings(self, prepro):
+        with torch.no_grad():
+            image_embeddings = self.model.encode_image(prepro)
+            image_embeddings /= image_embeddings.norm(dim=-1, keepdim=True)
+        return(image_embeddings)
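A minimal usage sketch of the new CLIPTransform class, not part of the commit. It assumes clip-retrieval and the open_clip ViT-H-14 weights are available locally, and it feeds a random NumPy array in place of a decoded WebRTC frame:

import numpy as np
from clip_transform import CLIPTransform

clip_transform = CLIPTransform()

# Text side: one embedding row per label, already L2-normalized.
labels = ["a person", "no one", "multiple people"]
text_embeddings = clip_transform.text_to_embeddings(labels)

# Image side: a random RGB frame stands in for frame.to_ndarray().
frame = np.random.randint(0, 255, size=(480, 640, 3), dtype=np.uint8)
image_embedding = clip_transform.image_to_embeddings(frame)

print(text_embeddings.shape, image_embedding.shape)

Both methods return normalized embeddings, one row per prompt or image, so they can be compared with a plain dot product the way app.py does.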
requirements.txt
CHANGED

@@ -10,4 +10,7 @@ streamlit_webrtc
 twilio
 python-dotenv
 watchdog
-pydub
+pydub
+torch
+numpy
+clip-retrieval == 2.36.1