nam_nguyenhoai_AI committed
Commit b0a48de
Parent(s): 987b643
Update algorithm

Files changed:
- .gitignore   +2 -0
- algorithm.py +118 -0
- app.py       +123 -4
- utils.py     +77 -0
.gitignore  ADDED
@@ -0,0 +1,2 @@
+*.mp4
+assets/examples_Video
algorithm.py  ADDED
@@ -0,0 +1,118 @@
+import faiss
+from sklearn.metrics import pairwise_distances_argmin_min
+import random
+import numpy as np
+from utils import *
+
+def kmeans(features, number_of_clusters):
+    # Cluster the clip features with K-Means and keep the clips closest to the centroids
+
+    # K-means from sklearn
+    #kmeans = KMeans(n_clusters=number_of_clusters, random_state=0).fit(features)
+
+    # K-means from faiss
+    ncentroids = number_of_clusters
+    niter = 10
+    verbose = True
+    x = features
+
+    # Feature dimension, taken from the first element of the list
+    dimension = x[0].shape[0]
+
+    kmeans = faiss.Kmeans(dimension, ncentroids, niter=niter, verbose=verbose)
+    kmeans.train(x)
+
+    #closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, features)
+    closest, _ = pairwise_distances_argmin_min(kmeans.centroids, x)
+
+    closest_clips_frames = []
+    # Each clip index i covers frames i*8 .. i*8+7
+    for i in sorted(closest):
+        for idx in range(i*8, (i+1)*8):
+            closest_clips_frames.append(idx)
+
+    return closest_clips_frames
+
+def tt01(features, threshold):
+    # Segment the video by comparing each clip against the first clip of the current segment
+    i = 0
+    clips = []
+
+    # compare the sum of squared difference between clips i and j
+    for j in range(1, len(features)):
+        if sum_of_squared_difference(features[i], features[j]) > threshold:
+            clip = []
+
+            # add frames from clip i to j-1 to the clip list
+            for b in range(i*8, j*8):
+                clip.append(b)
+
+            # randomly select 15% of the frames from the clip list
+            random_num = round(len(clip)*0.15)
+
+            # sort the sampled frames to preserve their temporal order
+            random_Frames = sorted(random.sample(clip, random_num))
+            i = j
+            clips.extend(random_Frames)
+
+    # add the last segment to the clip list
+    clip = []
+    if i == j:
+        for c in range(j*8, j*8+8):
+            clip.append(c)
+        random_num = round(len(clip)*0.15)
+        random_Frames = sorted(random.sample(clip, random_num))
+        #print("i == j")
+
+    else:  # (i < j)
+        for c in range(i*8, (j+1)*8):
+            clip.append(c)
+        random_num = round(len(clip)*0.15)
+        random_Frames = sorted(random.sample(clip, random_num))
+        #print(f"{i} with {j}")
+
+    clips.extend(random_Frames)
+
+    return clips
+
+def tt02(features, threshold):
+    # Segment the video by comparing each clip against the immediately preceding clip
+    i = 0
+    previous = i
+    clips = []
+
+    # compare the sum of squared difference between clips previous and j
+    for j in range(1, len(features)):
+        if sum_of_squared_difference(features[previous], features[j]) > threshold:
+            clip = []
+
+            # add frames from clip i to j-1 to the clip list
+            for b in range(i*8, j*8):
+                clip.append(b)
+
+            # randomly select 15% of the frames from the clip list
+            random_num = round(len(clip)*0.15)
+            # sort the sampled frames to preserve their temporal order
+            random_Frames = sorted(random.sample(clip, random_num))
+            i = j
+            clips.extend(random_Frames)
+
+        previous = j
+
+    # add the last segment to the clip list
+    clip = []
+    if i == j:
+        for c in range(j*8, j*8+8):
+            clip.append(c)
+        random_num = round(len(clip)*0.15)
+        random_Frames = sorted(random.sample(clip, random_num))
+
+    else:  # (i < j)
+        for c in range(i*8, (j+1)*8):
+            clip.append(c)
+        random_num = round(len(clip)*0.15)
+        random_Frames = sorted(random.sample(clip, random_num))
+
+    clips.extend(random_Frames)
+
+    return clips
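For reference, a minimal usage sketch (an editor's example, not part of the commit) of the three selection functions above. It assumes the argument order used at the app.py call site, kmeans(features, number_of_clusters), and that each entry of features is one Timesformer feature vector per 8-frame clip; the 20x768 random array is a hypothetical stand-in for real clip features.

import numpy as np
from algorithm import kmeans, tt01, tt02

# Hypothetical stand-in for per-clip Timesformer features: 20 clips x 768 dims.
features = np.random.rand(20, 768).astype("float32")

# K-Means keeps the frames of the clips nearest to the 3 centroids.
kmeans_frames = kmeans(features, number_of_clusters=3)

# The SSD variants keep ~15% of frames from each detected segment
# (400 is the threshold app.py passes).
ssd01_frames = tt01(features, threshold=400)
ssd02_frames = tt02(features, threshold=400)

print(len(kmeans_frames), len(ssd01_frames), len(ssd02_frames))

All three return sorted frame indices into the original frame list, which app.py then writes out as the summarized video.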
app.py  CHANGED
@@ -1,6 +1,126 @@
 import gradio as gr
 import cv2
 import os
+import spaces
+import tempfile
+from torchvision import transforms
+from torchvision.transforms import Compose
+import torch
+import numpy as np
+from PIL import Image
+import torch.nn.functional as F
+# from pytorchvideo.transforms.functional import predict_depth  # not available in pytorchvideo and not used
+from transformers import pipeline, TimesformerModel, VideoMAEImageProcessor
+from utils import *
+from algorithm import *
+
+@spaces.GPU
+def make_video(video_path, outdir='./summarized_video', encoder='Kmeans'):
+    if encoder not in ["Kmeans", "Sum of Squared Difference 01", "Sum of Squared Difference 02"]:
+        encoder = "Kmeans"
+    # should add cases for other models here
+    margin_width = 50
+
+    model, processor, device = load_model()
+
+    # total_params = sum(param.numel() for param in model.parameters())
+    # print('Total parameters: {:.2f}M'.format(total_params / 1e6))
+
+    if os.path.isfile(video_path):
+        if video_path.endswith('txt'):
+            with open(video_path, 'r') as f:
+                filenames = f.read().splitlines()
+        else:
+            filenames = [video_path]
+    else:
+        filenames = os.listdir(video_path)
+        filenames = [os.path.join(video_path, filename) for filename in filenames if not filename.startswith('.')]
+        filenames.sort()
+
+    for k, filename in enumerate(filenames):
+        print('Progress {:}/{:},'.format(k+1, len(filenames)), 'Processing', filename)
+
+        raw_video = cv2.VideoCapture(filename)
+        frame_width, frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frame_rate = int(raw_video.get(cv2.CAP_PROP_FPS))
+        #length = int(raw_video.get(cv2.CAP_PROP_FRAME_COUNT))
+        output_width = frame_width * 2 + margin_width
+
+        filename = os.path.basename(filename)
+
+        # Find the size to resize to
+        if "shortest_edge" in processor.size:
+            height = width = processor.size["shortest_edge"]
+        else:
+            height = processor.size["height"]
+            width = processor.size["width"]
+        resize_to = (height, width)
+
+        # F/Fs
+        clip_sample_rate = 1
+        # F
+        num_frames = 8
+
+        frames = []
+        features = []
+
+        # output_path = os.path.join(outdir, filename[:filename.rfind('.')] + '_video_depth.mp4')
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmpfile:
+            output_path = tmpfile.name
+        #out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"avc1"), frame_rate, (output_width, frame_height))
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        out = cv2.VideoWriter(output_path, fourcc, frame_rate, (output_width, frame_height))
+        # count=0
+
+        while raw_video.isOpened():
+            ret, raw_frame = raw_video.read()
+            if not ret:
+                break
+
+            raw_frame = cv2.resize(raw_frame, resize_to)
+            frames.append(raw_frame)
+
+        # Subsample frames at clip_sample_rate to get the candidate key frames
+        key_frames = frames[::clip_sample_rate]
+        #print('total of frames after sample:', len(selected_frames))
+
+        # Drop trailing frames so the count is divisible by num_frames
+        num_redudant_frames = len(key_frames) - (len(key_frames) % num_frames)
+
+        # Final key frames
+        final_key_frames = key_frames[:num_redudant_frames]
+        #print('total of frames after remove redundant frames:', len(selected_frames))
+
+        for i in range(0, len(final_key_frames), num_frames):
+            if i % (num_frames*50) == 0:
+                print(f"Loading {i}/{len(final_key_frames)}")
+
+            # Input clip to the model
+            input_frames = final_key_frames[i:i+num_frames]
+            # Extract features
+            batch_features = extract_features(input_frames, device, model, processor)
+            # Convert to a numpy array to reduce memory usage
+            batch_features = np.array(batch_features.cpu().detach().numpy())
+            features.extend(batch_features)
+
+        number_of_clusters = round(len(features)*0.15)
+
+        selected_frames = []
+        if encoder == "Kmeans":
+            selected_frames = kmeans(features, number_of_clusters)
+        elif encoder == "Sum of Squared Difference 01":
+            selected_frames = tt01(features, 400)
+        else:
+            selected_frames = tt02(features, 400)
+
+        video_writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), frame_rate, (frames[0].shape[1], frames[0].shape[0]))
+        for idx in selected_frames:
+            video_writer.write(frames[idx])
+
+        raw_video.release()
+        video_writer.release()
+        print("Completed summarizing the video (wait for a moment to load).")
+        return output_path
 
 css = """
 #img-display-container {
@@ -14,7 +134,6 @@ css = """
 }
 """
 
-
 title = "# Video Summarization Demo"
 description = """Video Summarization using Timesformer.
 
@@ -28,18 +147,18 @@ with gr.Blocks(css=css) as demo:
 
     with gr.Row():
         input_video = gr.Video(label="Input Video")
-
+        algorithm_type = gr.Dropdown(["Kmeans", "Sum of Squared Difference 01", "Sum of Squared Difference 02"], type="value", label='Algorithm')
         submit = gr.Button("Submit")
         processed_video = gr.Video(label="Summarized Video")
 
-    def on_submit(uploaded_video):
+    def on_submit(uploaded_video, algorithm_type):
 
         # Process the video and get the path of the output video
         #output_video_path = make_video(uploaded_video,encoder=model_type)
         pass
         #return output_video_path
 
-    submit.click(on_submit, inputs=[input_video], outputs=processed_video)
+    submit.click(on_submit, inputs=[input_video, algorithm_type], outputs=processed_video)
 
     #example_files = os.listdir('assets/examples_video')
     #example_files.sort()
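The on_submit handler is still stubbed with pass in this commit. Judging from the commented-out call above it, a plausible completion (an assumption, not part of the commit) would forward the dropdown value to make_video as the encoder name:

    def on_submit(uploaded_video, algorithm_type):
        # Summarize the uploaded video with the chosen algorithm and return the output path
        output_video_path = make_video(uploaded_video, encoder=algorithm_type)
        return output_video_path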
utils.py  ADDED
@@ -0,0 +1,77 @@
+from transformers import TimesformerModel, VideoMAEImageProcessor
+import torch
+import cv2
+import numpy as np
+from torchvision.transforms import Lambda
+from pytorchvideo.transforms import (
+    Normalize,
+)
+from torchvision.transforms import (
+    Lambda,
+)
+import os
+from os.path import isfile, join, basename
+
+def extract_features(frames, device, model, image_processor):
+    # Convert the frames to a tensor
+    frames_tensor = torch.stack([torch.from_numpy(frame) for frame in frames])
+    # Change the order of the tensor to (channel, num_frames, height, width)
+    frames_tensor = frames_tensor.permute(3, 0, 1, 2).to(device)
+
+    # Get the mean and std of the image processor
+    mean = image_processor.image_mean
+    std = image_processor.image_std
+
+    # Normalize frames
+    frames_tensor = Lambda(lambda x: x / 255.0)(frames_tensor)
+    frames_tensor = Normalize(mean, std)(frames_tensor)
+
+    # Change the order of the tensor to (num_frames, channel, height, width) and add a batch dimension
+    frames_tensor = frames_tensor.permute(1, 0, 2, 3).unsqueeze(0)
+
+    # Load the model to the device
+    model.to(device)
+    model.eval()
+    outputs = model(frames_tensor)
+
+    # Take the CLS token output of the Transformer encoder
+    final_output = outputs[0][:, 0]
+
+    return final_output
+
+def to_video(selected_frames, frames, output_path, video_fps):
+
+    print("MP4 Format.")
+    # Write the selected frames to a video
+    video_writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), video_fps, (frames[0].shape[1], frames[0].shape[0]))
+
+    # selected_frames is a list of frame indices
+    for idx in selected_frames:
+        video_writer.write(frames[idx])
+
+    video_writer.release()
+    print("Completed summarizing the video (wait for a moment to load).")
+
+def to_txt(selected_frames, output_path, clip_sample_rate):
+    # Write the selected frame indices to a txt file
+
+    with open(output_path, "w") as file:
+        for item in selected_frames:
+            file.write(str(item) + "\n")
+
+    print("Completed summarizing the txt (wait for a moment to load).")
+
+def load_model():
+    try:
+        DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+        model = TimesformerModel.from_pretrained("facebook/timesformer-base-finetuned-k600").to(DEVICE).eval()
+        processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")
+        return model, processor, DEVICE
+
+    except Exception as e:
+        print(e)
+
+def sum_of_squared_difference(vector1, vector2):
+    squared_diff = np.square(vector1 - vector2)
+    sum_squared_diff = np.sum(squared_diff)
+    return sum_squared_diff
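As a quick sanity check, a minimal sketch (an editor's example, not part of the commit) of how these helpers fit together: load the backbone once, then extract one feature vector per 8-frame clip of resized BGR frames, which is what app.py does in its feature-extraction loop. The all-zeros clip is a hypothetical stand-in for real video frames.

import numpy as np
from utils import load_model, extract_features

# Load the Timesformer backbone and its image processor once.
model, processor, device = load_model()

# Hypothetical clip: 8 frames of 224x224x3 uint8, the format cv2 frames have
# after resizing in app.py.
clip = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(8)]

clip_feature = extract_features(clip, device, model, processor)
print(clip_feature.shape)  # expected: torch.Size([1, 768]), the CLS token per clip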