engajify committed on
Commit
83c4da6
·
verified ·
1 Parent(s): 35b7b3f

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +21 -4
  2. app.py +146 -0
  3. gitattributes +35 -0
  4. requirements.txt +6 -0
README.md CHANGED
@@ -1,12 +1,29 @@
1
  ---
2
- title: Action Video
3
- emoji: 🐠
4
- colorFrom: blue
5
- colorTo: purple
6
  sdk: gradio
7
  sdk_version: 4.31.5
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Action Detection Video 2
3
+ emoji:
4
+ colorFrom: purple
5
+ colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 4.31.5
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
15
+
16
+
17
+ # Video Activity Classifier
18
+
19
+ This is a Gradio interface that lets users upload a video and enter an activity label to check whether that activity is present in the video. The app uses a zero-shot X-CLIP model (`microsoft/xclip-base-patch16-zero-shot`) to score the video against the provided label and a generic "other" label.
20
+
21
+ ## How to Use
22
+
23
+ 1. Upload a video.
24
+ 2. Enter an activity label to detect.
25
+ 3. The app will classify the video and display the results.
26
+
27
+ ## Example
28
+
29
+ For instance, to check if a person is playing basketball in the video, you can enter the label "playing basketball".
app.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ from transformers import AutoProcessor, AutoModel
5
+ from PIL import Image
6
+ import cv2
7
+ from pathlib import Path
8
+ from tempfile import NamedTemporaryFile
9
+
10
# Checkpoint identifier passed to both from_pretrained() calls below.
MODEL_NAME = "microsoft/xclip-base-patch16-zero-shot"
# Number of frames sampled from each uploaded video; model_interface passes it
# to the frame sampler and to concatenate_frames.
CLIP_LEN = 32

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Both objects are created once at import time and reused by every request.
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
17
+
18
def get_video_length(file_path):
    """Return the number of frames OpenCV reports for the video at ``file_path``.

    The value is whatever the OpenCV backend reports through
    ``CAP_PROP_FRAME_COUNT``; it is not verified by decoding the stream.

    Args:
        file_path: Path of the video file to inspect.

    Returns:
        The reported frame count as a non-negative ``int``. A non-positive
        report (for example when the file cannot be opened) is returned as 0.
    """
    cap = cv2.VideoCapture(file_path)
    try:
        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        # Release the handle even if querying the property fails.
        cap.release()
    # Never hand a negative count to callers that use it as a length.
    return max(length, 0)
23
+
24
def read_video_opencv(file_path, indices):
    """Read the frames at ``indices`` from the video at ``file_path``.

    Args:
        file_path: Path of the video file to read.
        indices: Iterable of integer frame positions to extract.

    Returns:
        A list with one RGB frame per index that could be read, in the order
        of ``indices``. Indices outside ``[0, frame_count - 1]`` and frames
        that fail to decode are skipped and reported on stdout, so the list
        may be shorter than ``indices``. An empty list is returned when the
        file cannot be opened.
    """
    frames = []
    failed_indices = []

    cap = cv2.VideoCapture(file_path)
    try:
        if not cap.isOpened():
            print(f"Error opening video file: {file_path}")
            return frames

        max_index = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1
        for idx in indices:
            # Plain ints keep the seek argument and the log message free of
            # NumPy scalar types.
            idx = int(idx)
            frame = None
            if 0 <= idx <= max_index:
                frame = get_frame_with_opened_cap(cap, idx)
            if frame is None:
                failed_indices.append(idx)
            else:
                frames.append(frame)
    finally:
        # Release the capture on every exit path, including the early return
        # above and any exception raised while decoding.
        cap.release()

    if failed_indices:
        print(f"Failed to extract frames at indices: {failed_indices}")
    return frames
48
+
49
def get_frame_with_opened_cap(cap, index):
    """Seek ``cap`` to frame ``index`` and return that frame converted to RGB.

    Args:
        cap: An already opened ``cv2.VideoCapture``.
        index: Frame position to seek to before reading.

    Returns:
        The frame converted from BGR to RGB, or ``None`` when the read fails.
    """
    cap.set(cv2.CAP_PROP_POS_FRAMES, index)
    success, bgr_frame = cap.read()
    if not success:
        return None
    return cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
55
+
56
def sample_uniform_frame_indices(clip_len, seg_len):
    """Pick ``clip_len`` frame indices spread across a ``seg_len``-frame video.

    Args:
        clip_len: Number of indices to return; must be positive.
        seg_len: Number of frames available in the video; must be positive.

    Returns:
        A 1-D ``np.int64`` array of length ``clip_len`` with values in
        ``[0, seg_len - 1]``. When the video has fewer frames than requested
        the frame range is cycled (``0, 1, ..., seg_len - 1, 0, 1, ...``);
        otherwise the indices are evenly spaced over the whole video.

    Raises:
        ValueError: If ``clip_len`` or ``seg_len`` is not positive.
    """
    if clip_len <= 0:
        raise ValueError(f"clip_len must be positive, got {clip_len}")
    if seg_len <= 0:
        raise ValueError(f"seg_len must be positive, got {seg_len}")

    positions = np.arange(clip_len, dtype=np.int64)
    if seg_len < clip_len:
        # Too few frames: loop over the video until clip_len indices exist.
        return positions % seg_len
    # floor(i * seg_len / clip_len) spans the full video. It equals
    # i * (seg_len // clip_len) whenever seg_len is a multiple of clip_len,
    # but does not bunch all samples at the start when it is not.
    return (positions * seg_len) // clip_len
65
+
66
def concatenate_frames(frames, clip_len):
    """Tile ``frames`` into a single contact-sheet image.

    Frames are pasted row by row, left to right. Every cell has the size of
    the first frame. When fewer frames than grid cells are supplied, the
    remaining cells are left as created by ``Image.new``; surplus frames are
    ignored.

    Args:
        frames: Sequence of image arrays; the first two dimensions of
            ``frames[0]`` are read as (height, width).
        clip_len: Number of cells the grid must hold. 32 uses a 4 x 8 grid;
            any other positive value uses a near-square grid.

    Returns:
        A ``PIL.Image.Image`` in RGB mode containing the tiled frames.

    Raises:
        ValueError: If ``frames`` is empty or ``clip_len`` is not positive.
    """
    if len(frames) == 0:
        raise ValueError("frames must contain at least one frame")
    if clip_len < 1:
        raise ValueError(f"clip_len must be positive, got {clip_len}")

    known_layouts = {32: (4, 8)}
    if clip_len in known_layouts:
        rows, cols = known_layouts[clip_len]
    else:
        # Fallback for other clip lengths: smallest near-square grid that fits.
        cols = int(np.ceil(np.sqrt(clip_len)))
        rows = int(np.ceil(clip_len / cols))

    frame_height, frame_width = frames[0].shape[:2]
    combined_image = Image.new('RGB', (frame_width * cols, frame_height * rows))
    for position, frame in enumerate(frames[:rows * cols]):
        row, col = divmod(position, cols)
        combined_image.paste(
            Image.fromarray(frame),
            (col * frame_width, row * frame_height),
        )
    return combined_image
80
+
81
def model_interface(uploaded_video, activity):
    """Score an uploaded video against ``activity`` and a generic "other" label.

    Args:
        uploaded_video: Path of the uploaded video file as passed in by the
            Gradio video input.
        activity: Free-text label of the activity to look for.

    Returns:
        A 4-tuple matching the interface outputs:
        the contact-sheet image of the sampled frames, a list of
        ``(label, "Probability: ...")`` rows, a list of
        ``(label, "Raw Score: ...")`` rows, and ``[label, probability]`` for
        the highest-probability label (probability in percent).

    Raises:
        gr.Error: If no video was uploaded, the label is blank, or no frame
            could be read from the video.
    """
    if not uploaded_video:
        raise gr.Error("Please upload a video.")
    if not activity or not activity.strip():
        raise gr.Error("Please enter an activity to detect.")
    activity = activity.strip()

    video_length = get_video_length(uploaded_video)
    if video_length <= 0:
        # Without this guard the frame sampler would divide by zero.
        raise gr.Error("Could not read any frames from the uploaded video.")

    indices = sample_uniform_frame_indices(CLIP_LEN, seg_len=video_length)
    video = read_video_opencv(uploaded_video, indices)
    if not video:
        raise gr.Error("Could not read any frames from the uploaded video.")
    if len(video) < CLIP_LEN:
        # Some frames failed to decode: repeat the last good frame so the
        # grid and the model input still receive CLIP_LEN frames.
        video.extend([video[-1]] * (CLIP_LEN - len(video)))
    concatenated_image = concatenate_frames(video, CLIP_LEN)

    activities_list = [activity, "other"]
    inputs = processor(
        text=activities_list,
        videos=list(video),
        return_tensors="pt",
        padding=True,
    )
    # Move every tensor to the model's device; leave other values untouched.
    inputs = {
        key: value.to(device) if isinstance(value, torch.Tensor) else value
        for key, value in inputs.items()
    }

    with torch.no_grad():
        outputs = model(**inputs)

    logits_per_video = outputs.logits_per_video
    probs = logits_per_video.softmax(dim=1)

    # Only one video is submitted, so row 0 holds its scores.
    probabilities = probs[0].tolist()
    logits = logits_per_video[0].tolist()

    results_probs = []
    results_logits = []
    for current_activity, prob, logit in zip(activities_list, probabilities, logits):
        results_probs.append((current_activity, f"Probability: {prob * 100:.2f}%"))
        results_logits.append((current_activity, f"Raw Score: {logit:.2f}"))

    max_prob_index = int(torch.argmax(probs[0]))
    likely_label = activities_list[max_prob_index]
    likely_probability = probabilities[max_prob_index] * 100

    return concatenated_image, results_probs, results_logits, [likely_label, likely_probability]
119
+
120
# Gradio UI definition: model_interface receives the video and the text label,
# and its four return values fill the four output components in order.
iface = gr.Interface(
    fn=model_interface,
    inputs=[
        gr.Video(label="Upload a Video"),
        gr.Textbox(label="Activity to Detect")
    ],
    outputs=[
        gr.Image(label="Concatenated Frames"),
        gr.Dataframe(headers=["Activity", "Probability"], label="Probabilities"),
        gr.Dataframe(headers=["Activity", "Raw Score"], label="Raw Scores"),
        gr.Textbox(label="Most Likely Activity")
    ],
    title="Video Activity Classifier",
    description="""
**Instructions:**

1. **Upload a Video**: Select a video file to upload.
2. **Enter Activity Label**: Specify the activity you want to detect in the video.
3. **View Results**:
- The concatenated frames from the video will be displayed.
- Probabilities and raw scores for the specified activity and the "other" category will be shown.
- The most likely activity detected in the video will be displayed.
"""
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ torch
4
+ numpy
5
+ Pillow
6
+ opencv-python