IbrahimHasani committed on
Commit
2dc6183
1 Parent(s): 8d1f721

Update app.py

Files changed (1)
  1. app.py +17 -18
app.py CHANGED
@@ -3,18 +3,15 @@ import torch
 import numpy as np
 from transformers import AutoProcessor, AutoModel
 from PIL import Image
-import cv2
+import cv2
 
-# Constants
 MODEL_NAME = "microsoft/xclip-base-patch16-zero-shot"
 CLIP_LEN = 32
 
-# Check for GPU and set device
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-# Load model and processor
+# Load model and processor once
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
-model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()
+model = AutoModel.from_pretrained(MODEL_NAME)
+
 
 def get_video_length(file_path):
     cap = cv2.VideoCapture(file_path)
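Note on this hunk: dropping the device logic keeps the model on CPU, which matches the later hunk that stops moving the processor outputs to a device; weights and input tensors must live on the same device. The `.eval()` call is also gone, though inference still runs under `torch.no_grad()` further down. A hypothetical sketch of what re-enabling GPU execution would involve (not part of this commit):

# Hypothetical, not in this commit: move the model and, later, the
# processor outputs to the same device before calling model(**inputs).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device).eval()   # eval() disables dropout at inference
# inside model_interface: inputs = inputs.to(device)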
@@ -25,8 +22,8 @@ def get_video_length(file_path):
 def read_video_opencv(file_path, indices):
     cap = cv2.VideoCapture(file_path)
     frames = []
-    for idx in indices:
-        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+    for i in indices:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
         ret, frame = cap.read()
         if ret:
             frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
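The indices passed in here come from `sample_uniform_frame_indices`; only its last two lines appear in this diff, so the `spacing` definition in the sketch below is an assumption (uniform spacing across the video is the usual pattern):

import numpy as np

def sample_uniform_frame_indices(clip_len, seg_len):
    spacing = seg_len // clip_len  # assumed; not visible in this diff
    indices = [i * spacing for i in range(clip_len)]
    return np.array(indices).astype(np.int64)

# With CLIP_LEN = 32 and a 320-frame video, spacing is 10, so
# read_video_opencv seeks to frames 0, 10, 20, ..., 310.
print(sample_uniform_frame_indices(32, 320)[:5])  # [ 0 10 20 30 40]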
@@ -43,20 +40,22 @@ def sample_uniform_frame_indices(clip_len, seg_len):
     indices = [i * spacing for i in range(clip_len)]
     return np.array(indices).astype(np.int64)
 
-def get_concatenation_layout(clip_len):
-    # Modify as needed for other clip lengths
-    if clip_len == 32:
-        return 4, 8
-
 def concatenate_frames(frames, clip_len):
-    rows, cols = get_concatenation_layout(clip_len)
+    layout = { 32: (4, 8) }
+    rows, cols = layout[clip_len]
     combined_image = Image.new('RGB', (frames[0].shape[1]*cols, frames[0].shape[0]*rows))
     frame_iter = iter(frames)
     y_offset = 0
     for i in range(rows):
         x_offset = 0
         for j in range(cols):
-            img = Image.fromarray(next(frame_iter))
+            img_array = next(frame_iter)
+
+            # Handling rank-4 tensor
+            if len(img_array.shape) == 4:
+                img_array = img_array[0]
+
+            img = Image.fromarray(img_array)
             combined_image.paste(img, (x_offset, y_offset))
             x_offset += frames[0].shape[1]
         y_offset += frames[0].shape[0]
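Two things change in this hunk. First, the layout helper is inlined: the old `get_concatenation_layout` fell through and returned `None` for any `clip_len` other than 32, so the tuple unpacking would fail with an opaque TypeError, whereas the dict lookup now fails fast with a clear KeyError. Second, `Image.fromarray` only accepts rank-2 or rank-3 arrays, so the new guard strips a leading batch dimension first. A minimal sketch of the guard in isolation:

import numpy as np
from PIL import Image

frame = np.zeros((1, 224, 224, 3), dtype=np.uint8)  # rank-4: (1, H, W, C)
if len(frame.shape) == 4:
    frame = frame[0]              # squeeze to (H, W, C)
img = Image.fromarray(frame)      # PIL accepts the rank-3 array
print(img.size)                   # (224, 224)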
@@ -74,7 +73,7 @@ def model_interface(uploaded_video, activity):
         videos=list(video),
         return_tensors="pt",
         padding=True,
-    ).to(device)  # Move inputs to GPU if available
+    )
 
     with torch.no_grad():
         outputs = model(**inputs)
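With the model left on CPU (first hunk), removing `.to(device)` here avoids a device mismatch between the processor outputs and the weights. For reference, the surrounding zero-shot call pattern inside `model_interface` looks roughly like this; the `text=` argument and the softmax step are not visible in this diff, so they are assumptions based on the standard X-CLIP recipe:

inputs = processor(
    text=activities_list,      # assumed: candidate activity labels
    videos=list(video),        # the 32 sampled frames from above
    return_tensors="pt",
    padding=True,
)                              # CPU tensors, matching the CPU model

with torch.no_grad():
    outputs = model(**inputs)

# one logit per (video, label) pair; softmax over the label axis
probs = outputs.logits_per_video.softmax(dim=1)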
@@ -95,7 +94,7 @@ def model_interface(uploaded_video, activity):
     likely_label = activities_list[max_prob_index]
     likely_probability = float(probs[0][max_prob_index]) * 100
 
-    return concatenated_image, results_probs, results_logits, [ likely_label , likely_probability ]
+    return concatenated_image, results_probs, results_logits, [likely_label, likely_probability]
 
 iface = gr.Interface(
     fn=model_interface,
 