nam_nguyenhoai_AI committed
Commit b0a48de
Parent(s): 987b643
Update algorithm

Files changed:
- .gitignore   +2 -0
- algorithm.py +118 -0
- app.py       +123 -4
- utils.py     +77 -0
.gitignore  ADDED
@@ -0,0 +1,2 @@
+*.mp4
+assets/examples_Video
algorithm.py  ADDED
@@ -0,0 +1,118 @@
+import faiss
+from sklearn.metrics import pairwise_distances_argmin_min
+import random
+import numpy as np
+from utils import *
+
+def kmeans(features, number_of_clusters):
+    # Cluster the clip features with K-Means and keep the clips closest to the centroids
+
+    # K-means from sklearn
+    #kmeans = KMeans(n_clusters=number_of_clusters, random_state=0).fit(features)
+
+    # K-means from faiss
+    ncentroids = number_of_clusters
+    niter = 10
+    verbose = True
+    x = features
+
+    # Feature dimension, taken from the first element of the list
+    dimension = x[0].shape[0]
+
+    kmeans = faiss.Kmeans(dimension, ncentroids, niter=niter, verbose=verbose)
+    kmeans.train(x)
+
+    #closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, features)
+    closest, _ = pairwise_distances_argmin_min(kmeans.centroids, x)
+
+    closest_clips_frames = []
+    # Each clip index i covers frames i*8 .. i*8+7
+    for i in sorted(closest):
+        for idx in range(i*8, (i+1)*8):
+            closest_clips_frames.append(idx)
+
+    return closest_clips_frames
+
+def tt01(features, threshold):
+    # Segment the video by comparing each clip against the first clip of the current segment
+    i = 0
+    clips = []
+
+    # compare the sum of squared difference between clips i and j
+    for j in range(1, len(features)):
+        if sum_of_squared_difference(features[i], features[j]) > threshold:
+            clip = []
+
+            # add frames from clip i to j-1 to the clip list
+            for b in range(i*8, j*8):
+                clip.append(b)
+
+            # randomly select 15% of the frames from the clip list
+            random_num = round(len(clip)*0.15)
+
+            # sort the sampled frames to preserve their temporal order
+            random_Frames = sorted(random.sample(clip, random_num))
+            i = j
+            clips.extend(random_Frames)
+
+    # add the last segment to the clip list
+    clip = []
+    if i == j:
+        for c in range(j*8, j*8+8):
+            clip.append(c)
+        random_num = round(len(clip)*0.15)
+        random_Frames = sorted(random.sample(clip, random_num))
+        #print("i == j")
+
+    else:  # (i < j)
+        for c in range(i*8, (j+1)*8):
+            clip.append(c)
+        random_num = round(len(clip)*0.15)
+        random_Frames = sorted(random.sample(clip, random_num))
+        #print(f"{i} with {j}")
+
+    clips.extend(random_Frames)
+
+    return clips
+
+def tt02(features, threshold):
+    # Segment the video by comparing each clip against the immediately preceding clip
+    i = 0
+    previous = i
+    clips = []
+
+    # compare the sum of squared difference between clips previous and j
+    for j in range(1, len(features)):
+        if sum_of_squared_difference(features[previous], features[j]) > threshold:
+            clip = []
+
+            # add frames from clip i to j-1 to the clip list
+            for b in range(i*8, j*8):
+                clip.append(b)
+
+            # randomly select 15% of the frames from the clip list
+            random_num = round(len(clip)*0.15)
+            # sort the sampled frames to preserve their temporal order
+            random_Frames = sorted(random.sample(clip, random_num))
+            i = j
+            clips.extend(random_Frames)
+
+        previous = j
+
+    # add the last segment to the clip list
+    clip = []
+    if i == j:
+        for c in range(j*8, j*8+8):
+            clip.append(c)
+        random_num = round(len(clip)*0.15)
+        random_Frames = sorted(random.sample(clip, random_num))
+
+    else:  # (i < j)
+        for c in range(i*8, (j+1)*8):
+            clip.append(c)
+        random_num = round(len(clip)*0.15)
+        random_Frames = sorted(random.sample(clip, random_num))
+
+    clips.extend(random_Frames)
+
+    return clips
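For reference, a minimal usage sketch (an editor's example, not part of the commit) of the three selection functions above. It assumes the argument order used at the app.py call site, kmeans(features, number_of_clusters), and that each entry of features is one Timesformer feature vector per 8-frame clip; the 20x768 random array is a hypothetical stand-in for real clip features.

import numpy as np
from algorithm import kmeans, tt01, tt02

# Hypothetical stand-in for per-clip Timesformer features: 20 clips x 768 dims.
features = np.random.rand(20, 768).astype("float32")

# K-Means keeps the frames of the clips nearest to the 3 centroids.
kmeans_frames = kmeans(features, number_of_clusters=3)

# The SSD variants keep ~15% of frames from each detected segment
# (400 is the threshold app.py passes).
ssd01_frames = tt01(features, threshold=400)
ssd02_frames = tt02(features, threshold=400)

print(len(kmeans_frames), len(ssd01_frames), len(ssd02_frames))

All three return sorted frame indices into the original frame list, which app.py then writes out as the summarized video.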
app.py  CHANGED
@@ -1,6 +1,126 @@
 import gradio as gr
 import cv2
 import os
+import spaces
+import tempfile
+from torchvision import transforms
+from torchvision.transforms import Compose
+import torch
+import numpy as np
+from PIL import Image
+import torch.nn.functional as F
+# from pytorchvideo.transforms.functional import predict_depth  # not available in pytorchvideo and not used
+from transformers import pipeline, TimesformerModel, VideoMAEImageProcessor
+from utils import *
+from algorithm import *
+
+@spaces.GPU
+def make_video(video_path, outdir='./summarized_video', encoder='Kmeans'):
+    if encoder not in ["Kmeans", "Sum of Squared Difference 01", "Sum of Squared Difference 02"]:
+        encoder = "Kmeans"
+    # should add cases for other models here
+    margin_width = 50
+
+    model, processor, device = load_model()
+
+    # total_params = sum(param.numel() for param in model.parameters())
+    # print('Total parameters: {:.2f}M'.format(total_params / 1e6))
+
+    if os.path.isfile(video_path):
+        if video_path.endswith('txt'):
+            with open(video_path, 'r') as f:
+                filenames = f.read().splitlines()
+        else:
+            filenames = [video_path]
+    else:
+        filenames = os.listdir(video_path)
+        filenames = [os.path.join(video_path, filename) for filename in filenames if not filename.startswith('.')]
+        filenames.sort()
+
+    for k, filename in enumerate(filenames):
+        print('Progress {:}/{:},'.format(k+1, len(filenames)), 'Processing', filename)
+
+        raw_video = cv2.VideoCapture(filename)
+        frame_width, frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frame_rate = int(raw_video.get(cv2.CAP_PROP_FPS))
+        #length = int(raw_video.get(cv2.CAP_PROP_FRAME_COUNT))
+        output_width = frame_width * 2 + margin_width
+
+        filename = os.path.basename(filename)
+
+        # Find the size to resize to
+        if "shortest_edge" in processor.size:
+            height = width = processor.size["shortest_edge"]
+        else:
+            height = processor.size["height"]
+            width = processor.size["width"]
+        resize_to = (height, width)
+
+        # F/Fs
+        clip_sample_rate = 1
+        # F
+        num_frames = 8
+
+        frames = []
+        features = []
+
+        # output_path = os.path.join(outdir, filename[:filename.rfind('.')] + '_video_depth.mp4')
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmpfile:
+            output_path = tmpfile.name
+        #out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"avc1"), frame_rate, (output_width, frame_height))
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        out = cv2.VideoWriter(output_path, fourcc, frame_rate, (output_width, frame_height))
+        # count=0
+
+        while raw_video.isOpened():
+            ret, raw_frame = raw_video.read()
+            if not ret:
+                break
+
+            raw_frame = cv2.resize(raw_frame, resize_to)
+            frames.append(raw_frame)
+
+        # Subsample frames at clip_sample_rate to get the candidate key frames
+        key_frames = frames[::clip_sample_rate]
+        #print('total of frames after sample:', len(selected_frames))
+
+        # Drop trailing frames so the count is divisible by num_frames
+        num_redudant_frames = len(key_frames) - (len(key_frames) % num_frames)
+
+        # Final key frames
+        final_key_frames = key_frames[:num_redudant_frames]
+        #print('total of frames after remove redundant frames:', len(selected_frames))
+
+        for i in range(0, len(final_key_frames), num_frames):
+            if i % (num_frames*50) == 0:
+                print(f"Loading {i}/{len(final_key_frames)}")
+
+            # Input clip to the model
+            input_frames = final_key_frames[i:i+num_frames]
+            # Extract features
+            batch_features = extract_features(input_frames, device, model, processor)
+            # Convert to a numpy array to reduce memory usage
+            batch_features = np.array(batch_features.cpu().detach().numpy())
+            features.extend(batch_features)
+
+        number_of_clusters = round(len(features)*0.15)
+
+        selected_frames = []
+        if encoder == "Kmeans":
+            selected_frames = kmeans(features, number_of_clusters)
+        elif encoder == "Sum of Squared Difference 01":
+            selected_frames = tt01(features, 400)
+        else:
+            selected_frames = tt02(features, 400)
+
+        video_writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), frame_rate, (frames[0].shape[1], frames[0].shape[0]))
+        for idx in selected_frames:
+            video_writer.write(frames[idx])
+
+        raw_video.release()
+        video_writer.release()
+        print("Completed summarizing the video (wait for a moment to load).")
+        return output_path
 
 css = """
 #img-display-container {
@@ -14,7 +134,6 @@ css = """
 }
 """
 
-
 title = "# Video Summarization Demo"
 description = """Video Summarization using Timesformer.
 
@@ -28,18 +147,18 @@ with gr.Blocks(css=css) as demo:
 
     with gr.Row():
         input_video = gr.Video(label="Input Video")
-
+        algorithm_type = gr.Dropdown(["Kmeans", "Sum of Squared Difference 01", "Sum of Squared Difference 02"], type="value", label='Algorithm')
         submit = gr.Button("Submit")
         processed_video = gr.Video(label="Summarized Video")
 
-    def on_submit(uploaded_video):
+    def on_submit(uploaded_video, algorithm_type):
 
         # Process the video and get the path of the output video
         #output_video_path = make_video(uploaded_video,encoder=model_type)
         pass
         #return output_video_path
 
-    submit.click(on_submit, inputs=[input_video], outputs=processed_video)
+    submit.click(on_submit, inputs=[input_video, algorithm_type], outputs=processed_video)
 
     #example_files = os.listdir('assets/examples_video')
     #example_files.sort()
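The on_submit handler is still stubbed with pass in this commit. Judging from the commented-out call above it, a plausible completion (an assumption, not part of the commit) would forward the dropdown value to make_video as the encoder name:

    def on_submit(uploaded_video, algorithm_type):
        # Summarize the uploaded video with the chosen algorithm and return the output path
        output_video_path = make_video(uploaded_video, encoder=algorithm_type)
        return output_video_path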
utils.py  ADDED
@@ -0,0 +1,77 @@
+from transformers import TimesformerModel, VideoMAEImageProcessor
+import torch
+import cv2
+import numpy as np
+from torchvision.transforms import Lambda
+from pytorchvideo.transforms import (
+    Normalize,
+)
+from torchvision.transforms import (
+    Lambda,
+)
+import os
+from os.path import isfile, join, basename
+
+def extract_features(frames, device, model, image_processor):
+    # Convert the frames to a tensor
+    frames_tensor = torch.stack([torch.from_numpy(frame) for frame in frames])
+    # Change the order of the tensor to (channel, num_frames, height, width)
+    frames_tensor = frames_tensor.permute(3, 0, 1, 2).to(device)
+
+    # Get the mean and std of the image processor
+    mean = image_processor.image_mean
+    std = image_processor.image_std
+
+    # Normalize frames
+    frames_tensor = Lambda(lambda x: x / 255.0)(frames_tensor)
+    frames_tensor = Normalize(mean, std)(frames_tensor)
+
+    # Change the order of the tensor to (num_frames, channel, height, width) and add a batch dimension
+    frames_tensor = frames_tensor.permute(1, 0, 2, 3).unsqueeze(0)
+
+    # Load the model to the device
+    model.to(device)
+    model.eval()
+    outputs = model(frames_tensor)
+
+    # Take the CLS token output of the Transformer encoder
+    final_output = outputs[0][:, 0]
+
+    return final_output
+
+def to_video(selected_frames, frames, output_path, video_fps):
+
+    print("MP4 Format.")
+    # Write the selected frames to a video
+    video_writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), video_fps, (frames[0].shape[1], frames[0].shape[0]))
+
+    # selected_frames is a list of frame indices
+    for idx in selected_frames:
+        video_writer.write(frames[idx])
+
+    video_writer.release()
+    print("Completed summarizing the video (wait for a moment to load).")
+
+def to_txt(selected_frames, output_path, clip_sample_rate):
+    # Write the selected frame indices to a txt file
+
+    with open(output_path, "w") as file:
+        for item in selected_frames:
+            file.write(str(item) + "\n")
+
+    print("Completed summarizing the txt (wait for a moment to load).")
+
+def load_model():
+    try:
+        DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+        model = TimesformerModel.from_pretrained("facebook/timesformer-base-finetuned-k600").to(DEVICE).eval()
+        processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")
+        return model, processor, DEVICE
+
+    except Exception as e:
+        print(e)
+
+def sum_of_squared_difference(vector1, vector2):
+    squared_diff = np.square(vector1 - vector2)
+    sum_squared_diff = np.sum(squared_diff)
+    return sum_squared_diff
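As a quick sanity check, a minimal sketch (an editor's example, not part of the commit) of how these helpers fit together: load the backbone once, then extract one feature vector per 8-frame clip of resized BGR frames, which is what app.py does in its feature-extraction loop. The all-zeros clip is a hypothetical stand-in for real video frames.

import numpy as np
from utils import load_model, extract_features

# Load the Timesformer backbone and its image processor once.
model, processor, device = load_model()

# Hypothetical clip: 8 frames of 224x224x3 uint8, the format cv2 frames have
# after resizing in app.py.
clip = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(8)]

clip_feature = extract_features(clip, device, model, processor)
print(clip_feature.shape)  # expected: torch.Size([1, 768]), the CLS token per clip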