sohojoe committed
Commit c58cbbc
Parent: c6ad8e3

switch to open_clip

__pycache__/clip_transform.cpython-39.pyc CHANGED
Binary files a/__pycache__/clip_transform.cpython-39.pyc and b/__pycache__/clip_transform.cpython-39.pyc differ
 
app.py CHANGED
@@ -29,56 +29,81 @@ system_one = {
     "vision_embeddings_fps": 2,
     }
 
+
 system_one["video_detection_emotions"] = [
-    "Happiness",
-    "Sadness",
-    "Fear",
-    "Disgust",
-    "Anger",
-    "Surprise",
-    "Boredom",
-    "Interest",
-    "Excitement",
-    "Guilt",
-    "Shame",
-    "Relief",
-    "Love",
-    "Embarrassment",
-    "Pride",
-    "Envy",
-    "Jealousy",
-    "Anxiety",
-    "Hope",
-    "Despair",
-    "Frustration",
-    "Confusion",
-    "Curiosity",
-    "Contentment",
-    "Indifference",
-    "Anticipation",
-    "Gratitude",
-    "Bitterness"
+    "a happy person",
+    "the person is happy",
+    "the person's emotional state is happy",
+    "a sad person",
+    "a scared person",
+    "a disgusted person",
+    "an angry person",
+    "a suprised person",
+    "a bored person",
+    "an interested person",
+    "a guilty person",
+    "an indiffert person",
+    "a distracted person",
 ]
+
+
+# system_one["video_detection_emotions"] = [
+#     "Happiness",
+#     "Sadness",
+#     "Fear",
+#     "Disgust",
+#     "Anger",
+#     "Surprise",
+#     "Boredom",
+#     "Interest",
+#     "Excitement",
+#     "Guilt",
+#     "Shame",
+#     "Relief",
+#     "Love",
+#     "Embarrassment",
+#     "Pride",
+#     "Envy",
+#     "Jealousy",
+#     "Anxiety",
+#     "Hope",
+#     "Despair",
+#     "Frustration",
+#     "Confusion",
+#     "Curiosity",
+#     "Contentment",
+#     "Indifference",
+#     "Anticipation",
+#     "Gratitude",
+#     "Bitterness"
+# ]
 system_one["video_detection_engement"] = [
-    "Facial_Expressions",
-    "Open_Body_Language",
-    "Closed_Body_Language",
-    "Eye_Contact",
-    "Interest",
-    "Boredom",
-    "Confusion",
-    "Frustration",
-    "Question_Asking",
-    "Engaged_Language",
-    "Short_Responses",
-    "Distraction_Signs"
+    "the person is engaged in the conversation",
+    "the person is not engaged in the conversation",
+    "the person is looking at me",
+    "the person is not looking at me",
+    "the person is talking to me",
+    "the person is not talking to me",
+    "the person is engaged",
+    "the person is talking",
+    "the person is listening",
 ]
 system_one["video_detection_present"] = [
-    "a person",
-    "no one",
-    " ",
-    "multiple people",
-    "a group of people",
+    "the view from a webcam",
+    "the view from a webcam we see a person",
+    # "the view from a webcam. I see a person",
+    # "the view from a webcam. The person is looking at the camera",
+    # "i am a webcam",
+    # "i am a webcam and i see a person",
+    # "i am a webcam and i see a person. The person is looking at me",
+    # "a person",
+    # "a person on a Zoom call",
+    # "a person on a FaceTime call",
+    # "a person on a WebCam call",
+    # "no one",
+    # " ",
+    # "multiple people",
+    # "a group of people",
 ]
 
 system_one_audio_status = st.empty()
@@ -203,6 +228,13 @@ def get_dot_similarities(video_embedding, embeddings, embeddings_labels):
     similarity_image_label.sort(reverse=True)
     return similarity_image_label
 
+def get_top_3_similarities_as_a_string(video_embedding, embeddings, embeddings_labels):
+    similarities = get_dot_similarities(video_embedding, embeddings, embeddings_labels)
+    top_3 = ""
+    range_len = 3 if len(similarities) > 3 else len(similarities)
+    for i in range(range_len):
+        top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
+    return top_3
 
 while True:
     if webrtc_ctx.state.playing:
@@ -221,18 +253,9 @@ while True:
             current_video_embedding_timestamp = current_time
             current_video_embedding = clip_transform.image_to_embeddings(video_frames[-1].to_ndarray())
 
-            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
-            emotions_top_3 = ""
-            for i in range(3):
-                emotions_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
-            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
-            engagement_top_3 = ""
-            for i in range(3):
-                engagement_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
-            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
-            present_top_3 = ""
-            for i in range(3):
-                present_top_3 += f"'{similarities[i][1]}' ({similarities[i][0]}), "
+            emotions_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
+            engagement_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
+            present_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
 
             # table_content = "**System 1 Video:**\n\n"
             table_content = "| System 1 Video | |\n| --- | --- |\n"
clip_transform.py CHANGED
@@ -3,8 +3,7 @@ import os
 import numpy as np
 import torch
 from PIL import Image
-from clip_retrieval.load_clip import load_clip, get_tokenizer
-# from clip_retrieval.clip_client import ClipClient, Modality
+import open_clip
 
 class CLIPTransform:
     def __init__(self):
@@ -14,15 +13,21 @@ class CLIPTransform:
         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
         # if self.device == "cpu" and torch.backends.mps.is_available():
         #     self.device = torch.device("mps")
-        # self._clip_model="ViT-L/14"
-        self._clip_model="open_clip:ViT-H-14"
-        # self._clip_model="open_clip:ViT-L-14"
-        # self._clip_model="open_clip:datacomp_xl_s13b_b90k"
-        # import open_clip
-        # pretrained = dict(open_clip.list_pretrained())
-        # checkpoint = pretrained[self._clip_model]
-        self.model, self.preprocess = load_clip(self._clip_model, use_jit=True, device=self.device)
-        self.tokenizer = get_tokenizer(self._clip_model)
+
+        # # ViT-H-14
+        # self._clip_model="ViT-H-14"
+        # self._pretrained='laion2B-s32B-b79K'
+
+        # # ViT-B-32
+        # self._clip_model="ViT-B-32"
+        # self._pretrained='laion2b_s34b_b79k'
+
+        # ViT-L/14 1.71gb
+        self._clip_model="ViT-L-14"
+        self._pretrained='datacomp_xl_s13b_b90k'
+
+        self.model, _, self.preprocess = open_clip.create_model_and_transforms(self._clip_model, pretrained=self._pretrained)
+        self.tokenizer = open_clip.get_tokenizer(self._clip_model)
 
         print ("using device", self.device)
 
debug.py ADDED
@@ -0,0 +1,4 @@
+from clip_transform import CLIPTransform
+clip_transform = CLIPTransform()
+
+print ("Initializing CLIP templates")
requirements.txt CHANGED
@@ -13,4 +13,4 @@ watchdog
 pydub
 torch
 numpy
-clip-retrieval == 2.36.1
+open_clip_torch