video does not really work that well
Files changed:
- __pycache__/clip_transform.cpython-39.pyc  +0 -0
- app.py  +120 -2
- clip_transform.py  +51 -0
- requirements.txt  +4 -1

__pycache__/clip_transform.cpython-39.pyc
ADDED

Binary file (1.9 kB)
app.py
CHANGED

@@ -7,6 +7,7 @@ import numpy as np
 import streamlit as st
 from streamlit_webrtc import WebRtcMode, webrtc_streamer
 import pydub
+import torch
 # import av
 # import cv2
 from sample_utils.turn import get_ice_servers
@@ -23,8 +24,65 @@ system_one = {
     "audio_bit_rate": 16000,
     # "audio_bit_rate": 32000,
     # "audio_bit_rate": 48000,
+
+    # "vision_embeddings_fps": 5,
+    "vision_embeddings_fps": 2,
 }
 
+system_one["video_detection_emotions"] = [
+    "Happiness",
+    "Sadness",
+    "Fear",
+    "Disgust",
+    "Anger",
+    "Surprise",
+    "Boredom",
+    "Interest",
+    "Excitement",
+    "Guilt",
+    "Shame",
+    "Relief",
+    "Love",
+    "Embarrassment",
+    "Pride",
+    "Envy",
+    "Jealousy",
+    "Anxiety",
+    "Hope",
+    "Despair",
+    "Frustration",
+    "Confusion",
+    "Curiosity",
+    "Contentment",
+    "Indifference",
+    "Anticipation",
+    "Gratitude",
+    "Bitterness"
+]
+system_one["video_detection_engement"] = [
+    "Facial_Expressions",
+    "Open_Body_Language",
+    "Closed_Body_Language",
+    "Eye_Contact",
+    "Interest",
+    "Boredom",
+    "Confusion",
+    "Frustration",
+    "Question_Asking",
+    "Engaged_Language",
+    "Short_Responses",
+    "Distraction_Signs"
+]
+system_one["video_detection_present"] = [
+    "a person",
+    "no one",
+    " ",
+    "multiple people",
+    "a group of people",
+]
+
+system_one_audio_status = st.empty()
+
 
 playing = st.checkbox("Playing", value=True)
 
@@ -94,6 +152,22 @@ async def queued_audio_frames_callback(
 
     return new_frames
 
+system_one_audio_status.write("Initializing CLIP model")
+from clip_transform import CLIPTransform
+clip_transform = CLIPTransform()
+
+system_one_audio_status.write("Initializing CLIP templates")
+
+embeddings = clip_transform.text_to_embeddings(system_one["video_detection_emotions"])
+system_one["video_detection_emotions_embeddings"] = embeddings
+
+embeddings = clip_transform.text_to_embeddings(system_one["video_detection_engement"])
+system_one["video_detection_engement_embeddings"] = embeddings
+
+embeddings = clip_transform.text_to_embeddings(system_one["video_detection_present"])
+system_one["video_detection_present_embeddings"] = embeddings
+
+system_one_audio_status.write("Initializing webrtc_streamer")
 webrtc_ctx = webrtc_streamer(
     key="charles",
     desired_playing_state=playing,
@@ -105,18 +179,31 @@ webrtc_ctx = webrtc_streamer(
     async_processing=True,
 )
 
-system_one_audio_status = st.empty()
 
 if not webrtc_ctx.state.playing:
     exit
 
-system_one_audio_status.write("Initializing
+system_one_audio_status.write("Initializing streaming")
 system_one_audio_output = st.empty()
+
+system_one_video_output = st.empty()
+
 system_one_audio_history = []
 system_one_audio_history_output = st.empty()
 
 
 sound_chunk = pydub.AudioSegment.empty()
+current_video_embedding = None
+current_video_embedding_timestamp = time.monotonic()
+
+
+def get_dot_similarities(video_embedding, embeddings, embeddings_labels):
+    dot_product = torch.mm(embeddings, video_embedding.T)
+    similarity_image_label = [(float("{:.4f}".format(dot_product[i][0])), embeddings_labels[i]) for i in range(len(embeddings_labels))]
+    similarity_image_label.sort(reverse=True)
+    return similarity_image_label
+
+
 while True:
     if webrtc_ctx.state.playing:
         # handle video
@@ -125,6 +212,37 @@ while True:
         while len(video_frames_deque) > 0:
             frame = video_frames_deque.popleft()
             video_frames.append(frame)
+        get_embeddings = False
+        get_embeddings |= current_video_embedding is None
+        current_time = time.monotonic()
+        elapsed_time = current_time - current_video_embedding_timestamp
+        get_embeddings |= elapsed_time > 1. / system_one['vision_embeddings_fps']
+        if get_embeddings and len(video_frames) > 0:
+            current_video_embedding_timestamp = current_time
+            current_video_embedding = clip_transform.image_to_embeddings(video_frames[-1].to_ndarray())
+
+            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
+            emotions_top_3 = ""
+            for i in range(3):
+                emotions_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
+            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
+            engagement_top_3 = ""
+            for i in range(3):
+                engagement_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
+            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
+            present_top_3 = ""
+            for i in range(3):
+                present_top_3 += f"'{similarities[i][1]}' ({similarities[i][0]}), "
+
+            # table_content = "**System 1 Video:**\n\n"
+            table_content = "| System 1 Video | |\n| --- | --- |\n"
+            table_content += f"| Present | {present_top_3} |\n"
+            table_content += f"| Emotion | {emotions_top_3} |\n"
+            table_content += f"| Engagement | {engagement_top_3} |\n"
+            system_one_video_output.markdown(table_content)
+            # system_one_video_output.markdown(f"**System 1 Video:** \n [Emotion: {emotions_top_3}], \n [Engagement: {engagement_top_3}], \n [Present: {present_top_3}] ")
+            # for similarity, image_label in similarity_image_label:
+            #     print (f"{similarity} {image_label}")
 
         # handle audio
         audio_frames = []
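For reference, the new get_dot_similarities helper scores one frame embedding against a set of label embeddings. Because CLIPTransform (added below) L2-normalizes both text and image embeddings, the matrix product is simply the cosine similarity between the frame and each label, and sorting the (score, label) tuples puts the best matches first. A minimal, self-contained sketch of that computation, not part of the commit, using random tensors as stand-ins for real CLIP outputs (the 1024 dimension is only illustrative):

import torch

# Stand-ins for CLIP outputs: 5 label embeddings and 1 frame embedding,
# both L2-normalized along the feature dimension (as CLIPTransform does).
labels = ["Happiness", "Sadness", "Fear", "Disgust", "Anger"]
text_embeddings = torch.nn.functional.normalize(torch.randn(5, 1024), dim=-1)
image_embedding = torch.nn.functional.normalize(torch.randn(1, 1024), dim=-1)

# (5, 1024) @ (1024, 1) -> (5, 1): one cosine similarity per label.
dot_product = torch.mm(text_embeddings, image_embedding.T)

# Pair each score with its label and sort best-first, as the app does.
scored = sorted(
    ((float(dot_product[i][0]), labels[i]) for i in range(len(labels))),
    reverse=True,
)
print(scored[:3])  # top-3 (score, label) pairs

In the app itself, the loop only recomputes the frame embedding once at least 1 / vision_embeddings_fps seconds have elapsed since the previous one, capping the CLIP workload at roughly two embeddings per second with the committed setting of 2.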
clip_transform.py
ADDED

@@ -0,0 +1,51 @@
+import json
+import os
+import numpy as np
+import torch
+from PIL import Image
+from clip_retrieval.load_clip import load_clip, get_tokenizer
+# from clip_retrieval.clip_client import ClipClient, Modality
+
+class CLIPTransform:
+    def __init__(self):
+        # os.environ["OMP_NUM_THREADS"] = "20"
+        # torch.set_num_threads(20)
+        # Load model
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        # if self.device == "cpu" and torch.backends.mps.is_available():
+        #     self.device = torch.device("mps")
+        # self._clip_model="ViT-L/14"
+        self._clip_model="open_clip:ViT-H-14"
+        # self._clip_model="open_clip:ViT-L-14"
+        # self._clip_model="open_clip:datacomp_xl_s13b_b90k"
+        # import open_clip
+        # pretrained = dict(open_clip.list_pretrained())
+        # checkpoint = pretrained[self._clip_model]
+        self.model, self.preprocess = load_clip(self._clip_model, use_jit=True, device=self.device)
+        self.tokenizer = get_tokenizer(self._clip_model)
+
+        print ("using device", self.device)
+
+    def text_to_embeddings(self, prompts):
+        # if prompt is a string, convert to list
+        if type(prompts) is str:
+            prompts = [prompts]
+        text = self.tokenizer(prompts).to(self.device)
+        with torch.no_grad():
+            prompt_embededdings = self.model.encode_text(text)
+            prompt_embededdings /= prompt_embededdings.norm(dim=-1, keepdim=True)
+        return(prompt_embededdings)
+
+    def image_to_embeddings(self, input_im):
+        input_im = Image.fromarray(input_im)
+        prepro = self.preprocess(input_im).unsqueeze(0).to(self.device)
+        with torch.no_grad():
+            image_embeddings = self.model.encode_image(prepro)
+            image_embeddings /= image_embeddings.norm(dim=-1, keepdim=True)
+        return(image_embeddings)
+
+    def preprocessed_image_to_emdeddings(self, prepro):
+        with torch.no_grad():
+            image_embeddings = self.model.encode_image(prepro)
+            image_embeddings /= image_embeddings.norm(dim=-1, keepdim=True)
+        return(image_embeddings)
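A minimal usage sketch of the new CLIPTransform class, not part of the commit. It assumes clip-retrieval and the open_clip ViT-H-14 weights are available locally, and it feeds a random NumPy array in place of a decoded WebRTC frame:

import numpy as np
from clip_transform import CLIPTransform

clip_transform = CLIPTransform()

# Text side: one embedding row per label, already L2-normalized.
labels = ["a person", "no one", "multiple people"]
text_embeddings = clip_transform.text_to_embeddings(labels)

# Image side: a random RGB frame stands in for frame.to_ndarray().
frame = np.random.randint(0, 255, size=(480, 640, 3), dtype=np.uint8)
image_embedding = clip_transform.image_to_embeddings(frame)

print(text_embeddings.shape, image_embedding.shape)

Both methods return normalized embeddings, one row per prompt or image, so they can be compared with a plain dot product the way app.py does.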
requirements.txt
CHANGED

@@ -10,4 +10,7 @@ streamlit_webrtc
 twilio
 python-dotenv
 watchdog
-pydub
+pydub
+torch
+numpy
+clip-retrieval == 2.36.1