sohojoe committed on
Commit
c6ad8e3
1 Parent(s): 8bf5b8d

video not really working that well

__pycache__/clip_transform.cpython-39.pyc ADDED
Binary file (1.9 kB).
 
app.py CHANGED
@@ -7,6 +7,7 @@ import numpy as np
 import streamlit as st
 from streamlit_webrtc import WebRtcMode, webrtc_streamer
 import pydub
+import torch
 # import av
 # import cv2
 from sample_utils.turn import get_ice_servers
 
@@ -23,8 +24,65 @@ system_one = {
     "audio_bit_rate": 16000,
     # "audio_bit_rate": 32000,
     # "audio_bit_rate": 48000,
+
+    # "vision_embeddings_fps": 5,
+    "vision_embeddings_fps": 2,
 }

+system_one["video_detection_emotions"] = [
+    "Happiness",
+    "Sadness",
+    "Fear",
+    "Disgust",
+    "Anger",
+    "Surprise",
+    "Boredom",
+    "Interest",
+    "Excitement",
+    "Guilt",
+    "Shame",
+    "Relief",
+    "Love",
+    "Embarrassment",
+    "Pride",
+    "Envy",
+    "Jealousy",
+    "Anxiety",
+    "Hope",
+    "Despair",
+    "Frustration",
+    "Confusion",
+    "Curiosity",
+    "Contentment",
+    "Indifference",
+    "Anticipation",
+    "Gratitude",
+    "Bitterness"
+]
+system_one["video_detection_engement"] = [
+    "Facial_Expressions",
+    "Open_Body_Language",
+    "Closed_Body_Language",
+    "Eye_Contact",
+    "Interest",
+    "Boredom",
+    "Confusion",
+    "Frustration",
+    "Question_Asking",
+    "Engaged_Language",
+    "Short_Responses",
+    "Distraction_Signs"
+]
+system_one["video_detection_present"] = [
+    "a person",
+    "no one",
+    " ",
+    "multiple people",
+    "a group of people",
+]
+
+system_one_audio_status = st.empty()
+

 playing = st.checkbox("Playing", value=True)

 
@@ -94,6 +152,22 @@ async def queued_audio_frames_callback(

     return new_frames

+system_one_audio_status.write("Initializing CLIP model")
+from clip_transform import CLIPTransform
+clip_transform = CLIPTransform()
+
+system_one_audio_status.write("Initializing CLIP templates")
+
+embeddings = clip_transform.text_to_embeddings(system_one["video_detection_emotions"])
+system_one["video_detection_emotions_embeddings"] = embeddings
+
+embeddings = clip_transform.text_to_embeddings(system_one["video_detection_engement"])
+system_one["video_detection_engement_embeddings"] = embeddings
+
+embeddings = clip_transform.text_to_embeddings(system_one["video_detection_present"])
+system_one["video_detection_present_embeddings"] = embeddings
+
+system_one_audio_status.write("Initializing webrtc_streamer")
 webrtc_ctx = webrtc_streamer(
     key="charles",
     desired_playing_state=playing,
 
@@ -105,18 +179,31 @@ webrtc_ctx = webrtc_streamer(
     async_processing=True,
 )

-system_one_audio_status = st.empty()

 if not webrtc_ctx.state.playing:
     exit

-system_one_audio_status.write("Initializing...")
+system_one_audio_status.write("Initializing streaming")
 system_one_audio_output = st.empty()
+
+system_one_video_output = st.empty()
+
 system_one_audio_history = []
 system_one_audio_history_output = st.empty()


 sound_chunk = pydub.AudioSegment.empty()
+current_video_embedding = None
+current_video_embedding_timestamp = time.monotonic()
+
+
+def get_dot_similarities(video_embedding, embeddings, embeddings_labels):
+    dot_product = torch.mm(embeddings, video_embedding.T)
+    similarity_image_label = [(float("{:.4f}".format(dot_product[i][0])), embeddings_labels[i]) for i in range(len(embeddings_labels))]
+    similarity_image_label.sort(reverse=True)
+    return similarity_image_label
+
+
 while True:
     if webrtc_ctx.state.playing:
         # handle video
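
Since CLIPTransform L2-normalizes both the text and image embeddings, the torch.mm call in get_dot_similarities above is computing cosine similarities between each label prompt and the current frame. A minimal, self-contained sketch of the same ranking, using random stand-in tensors in place of real CLIP outputs (no model required):

import torch

# Stand-ins for CLIP outputs: 3 label embeddings and 1 image embedding,
# both L2-normalized so the dot product equals cosine similarity.
labels = ["a person", "no one", "a group of people"]
text_embeddings = torch.nn.functional.normalize(torch.randn(3, 512), dim=-1)
image_embedding = torch.nn.functional.normalize(torch.randn(1, 512), dim=-1)

scores = torch.mm(text_embeddings, image_embedding.T).squeeze(1)  # shape: (3,)
ranked = sorted(zip(scores.tolist(), labels), reverse=True)       # highest similarity first
for score, label in ranked:
    print(f"{score:.4f} {label}")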
 
@@ -125,6 +212,37 @@ while True:
         while len(video_frames_deque) > 0:
             frame = video_frames_deque.popleft()
             video_frames.append(frame)
+        get_embeddings = False
+        get_embeddings |= current_video_embedding is None
+        current_time = time.monotonic()
+        elapsed_time = current_time - current_video_embedding_timestamp
+        get_embeddings |= elapsed_time > 1. / system_one['vision_embeddings_fps']
+        if get_embeddings and len(video_frames) > 0:
+            current_video_embedding_timestamp = current_time
+            current_video_embedding = clip_transform.image_to_embeddings(video_frames[-1].to_ndarray())
+
+            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
+            emotions_top_3 = ""
+            for i in range(3):
+                emotions_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
+            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
+            engagement_top_3 = ""
+            for i in range(3):
+                engagement_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
+            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
+            present_top_3 = ""
+            for i in range(3):
+                present_top_3 += f"'{similarities[i][1]}' ({similarities[i][0]}), "
+
+            # table_content = "**System 1 Video:**\n\n"
+            table_content = "| System 1 Video | |\n| --- | --- |\n"
+            table_content += f"| Present | {present_top_3} |\n"
+            table_content += f"| Emotion | {emotions_top_3} |\n"
+            table_content += f"| Engagement | {engagement_top_3} |\n"
+            system_one_video_output.markdown(table_content)
+            # system_one_video_output.markdown(f"**System 1 Video:** \n [Emotion: {emotions_top_3}], \n [Engagement: {engagement_top_3}], \n [Present: {present_top_3}] ")
+            # for similarity, image_label in similarity_image_label:
+            #     print (f"{similarity} {image_label}")

         # handle audio
         audio_frames = []
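
The embedding cadence in the hunk above is a simple monotonic-clock throttle driven by system_one["vision_embeddings_fps"] (2 fps means at most one CLIP image embedding every 0.5 s). A stripped-down sketch of the same pattern, with a hypothetical process(frame) callback standing in for clip_transform.image_to_embeddings:

import time

TARGET_FPS = 2        # mirrors system_one["vision_embeddings_fps"]
last_run = None       # timestamp of the last processed frame

def maybe_process(frame, process):
    """Run process(frame) at most TARGET_FPS times per second."""
    global last_run
    now = time.monotonic()
    if last_run is None or (now - last_run) > 1.0 / TARGET_FPS:
        last_run = now
        return process(frame)
    return None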
clip_transform.py ADDED
@@ -0,0 +1,51 @@
+import json
+import os
+import numpy as np
+import torch
+from PIL import Image
+from clip_retrieval.load_clip import load_clip, get_tokenizer
+# from clip_retrieval.clip_client import ClipClient, Modality
+
+class CLIPTransform:
+    def __init__(self):
+        # os.environ["OMP_NUM_THREADS"] = "20"
+        # torch.set_num_threads(20)
+        # Load model
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        # if self.device == "cpu" and torch.backends.mps.is_available():
+        #     self.device = torch.device("mps")
+        # self._clip_model="ViT-L/14"
+        self._clip_model="open_clip:ViT-H-14"
+        # self._clip_model="open_clip:ViT-L-14"
+        # self._clip_model="open_clip:datacomp_xl_s13b_b90k"
+        # import open_clip
+        # pretrained = dict(open_clip.list_pretrained())
+        # checkpoint = pretrained[self._clip_model]
+        self.model, self.preprocess = load_clip(self._clip_model, use_jit=True, device=self.device)
+        self.tokenizer = get_tokenizer(self._clip_model)
+
+        print ("using device", self.device)
+
+    def text_to_embeddings(self, prompts):
+        # if prompt is a string, convert to list
+        if type(prompts) is str:
+            prompts = [prompts]
+        text = self.tokenizer(prompts).to(self.device)
+        with torch.no_grad():
+            prompt_embededdings = self.model.encode_text(text)
+            prompt_embededdings /= prompt_embededdings.norm(dim=-1, keepdim=True)
+        return(prompt_embededdings)
+
+    def image_to_embeddings(self, input_im):
+        input_im = Image.fromarray(input_im)
+        prepro = self.preprocess(input_im).unsqueeze(0).to(self.device)
+        with torch.no_grad():
+            image_embeddings = self.model.encode_image(prepro)
+            image_embeddings /= image_embeddings.norm(dim=-1, keepdim=True)
+        return(image_embeddings)
+
+    def preprocessed_image_to_emdeddings(self, prepro):
+        with torch.no_grad():
+            image_embeddings = self.model.encode_image(prepro)
+            image_embeddings /= image_embeddings.norm(dim=-1, keepdim=True)
+        return(image_embeddings)
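
A possible usage sketch for the class above, assuming clip-retrieval 2.36.1 is installed and the open_clip:ViT-H-14 checkpoint can be downloaded; the zero-filled frame is only a placeholder for a real video frame (app.py passes frame.to_ndarray() from streamlit-webrtc):

import numpy as np
from clip_transform import CLIPTransform

clip_transform = CLIPTransform()

# Label prompts are embedded once and reused; both outputs are L2-normalized.
labels = ["a person", "no one", "a group of people"]
label_embeddings = clip_transform.text_to_embeddings(labels)

# Placeholder H x W x 3 uint8 image standing in for a real webcam frame.
frame = np.zeros((480, 640, 3), dtype=np.uint8)
image_embedding = clip_transform.image_to_embeddings(frame)

scores = label_embeddings @ image_embedding.T        # cosine similarities, shape (3, 1)
print("best label:", labels[int(scores.argmax())])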
requirements.txt CHANGED
@@ -10,4 +10,7 @@ streamlit_webrtc
 twilio
 python-dotenv
 watchdog
-pydub
+pydub
+torch
+numpy
+clip-retrieval == 2.36.1