Spaces:
Runtime error
EduardoPacheco
committed
Commit • ce78b5d
Parent(s): 0293c20
First commit
Files changed:
- .gitattributes +1 -0
- .gitignore +1 -0
- app.py +65 -0
- assets/dog-running.mp4 +3 -0
- utils.py +141 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/dog-running.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
__pycache__
app.py
ADDED
@@ -0,0 +1,65 @@
import torch
import gradio as gr
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel

import utils

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained('facebook/dinov2-base')
model.to(device)

def app_fn(
    source_video: str,
    batch_size: int,
    threshold: float,
    n_patches: int,
    is_larger: bool,
    interpolate: bool,
) -> str:
    frames = utils.load_video_frames(source_video)
    processed_frames = utils.process_video(
        model=model,
        video=frames,
        batch_size=batch_size,
        threshold=threshold,
        n_patches=n_patches,
        is_larger=is_larger,
        interpolate=interpolate,
        device=device
    )

    output_video = utils.create_video_from_frames_rgb(processed_frames)

    return output_video

if __name__ == "__main__":
    title = "🦖 DINOv2 Video 🦖"
    with gr.Blocks() as demo:
        with gr.Row():
            source_video = gr.Video(label="Input Video", sources="upload", format="mp4")
            output_video = gr.Video(label="Output Video")
        with gr.Row():
            batch_size = gr.Slider(minimum=1, maximum=30, step=1, value=4, label="Batch Size")
            threshold = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label="Threshold")
            n_patches = gr.Slider(minimum=20, maximum=40, step=1, value=30, label="Number of Patches")
            is_larger = gr.Checkbox(label="Is Larger", value=True)
            interpolate = gr.Checkbox(label="Interpolate", value=False)

        btn = gr.Button("Process Video")
        btn.click(
            fn=app_fn,
            inputs=[source_video, batch_size, threshold, n_patches, is_larger, interpolate],
            outputs=[output_video]
        )
        examples = gr.Examples(
            examples=[
                ["assets/dog-running.mp4", 30, 0.5, 40, True, False],
            ],
            inputs=[source_video, batch_size, threshold, n_patches, is_larger, interpolate],
            outputs=[output_video],
            fn=app_fn,
            cache_examples=True
        )

    demo.queue(max_size=5).launch()
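Illustrative only, not part of the commit: a minimal sketch of how the same pipeline that app_fn wires into the Gradio UI can be exercised directly, assuming utils.py and the example video above are available locally.

import torch
from transformers import AutoModel

import utils

# Load DINOv2 exactly as app.py does, then run the PCA pipeline without Gradio.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained("facebook/dinov2-base").to(device)

frames = utils.load_video_frames("assets/dog-running.mp4")
processed = utils.process_video(
    model=model,
    video=frames,
    batch_size=4,
    threshold=0.5,
    n_patches=30,
    is_larger=True,
    interpolate=False,
    device=device,
)
print(utils.create_video_from_frames_rgb(processed))  # path to the rendered mp4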
assets/dog-running.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b36eaefc8b224d27f262f37d092f1438a2bbe6a997bb0077889a7cb5ab9911eb
size 3446481
utils.py
ADDED
@@ -0,0 +1,141 @@
from typing import List

import cv2
import torch
import numpy as np
from tqdm import tqdm
import supervision as sv
import torch.nn.functional as F
from transformers import AutoModel
from sklearn.decomposition import PCA
from torchvision import transforms as T
from sklearn.preprocessing import MinMaxScaler


def load_video_frames(video_path: str) -> List[np.ndarray]:
    frames = []
    for frame in tqdm(sv.get_video_frames_generator(source_path=video_path), unit=" frames"):
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    return frames

def preprocess(image: np.ndarray, n_patches: int, device: str, patch_size: int = 14) -> torch.Tensor:
    IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
    IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)

    transform = T.Compose([
        T.Resize((n_patches * patch_size, n_patches * patch_size)),
        T.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
    ])

    img = torch.from_numpy(image).type(torch.float).permute(2, 0, 1) / 255
    img_tensor = transform(img).unsqueeze(0).to(device)

    return img_tensor
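An illustrative sanity check, not part of the committed utils.py: resizing to n_patches * patch_size makes the DINOv2 token grid exactly n_patches × n_patches, which is what process_video below relies on when it drops the CLS token and reshapes the features.

import numpy as np
from utils import preprocess

# With patch_size=14 and n_patches=30 the frame is resized to 420x420, so the
# ViT yields a 30x30 grid of patch tokens plus one CLS token.
dummy_frame = np.zeros((480, 640, 3), dtype=np.uint8)   # H, W, C RGB frame
tensor = preprocess(dummy_frame, n_patches=30, device="cpu", patch_size=14)
assert tensor.shape == (1, 3, 30 * 14, 30 * 14)          # B, C, NP*P, NP*P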
def process_video(
    model: AutoModel,
    video: str | List[np.ndarray],
    is_larger: bool = True,
    batch_size: int = 4,
    threshold: float = 0.5,
    n_patches: int = 40,
    interpolate: bool = False,
    device: str = "cpu"
) -> List[np.ndarray]:
    # NP = N_PATCHES
    # P = PATCH_SIZE
    if isinstance(video, str):
        frames = load_video_frames(video)
    else:
        frames = video
    patch_size = model.config.patch_size

    original_height = frames[0].shape[0]  # frames are H, W, C
    original_width = frames[0].shape[1]   # frames are H, W, C

    final_frames = []
    pca = PCA(n_components=3)
    scaler = MinMaxScaler(clip=True)

    for i in range(len(frames) // batch_size):
        batch = frames[i * batch_size:batch_size * (i + 1)]
        pixel_values = [
            preprocess(f, n_patches, device, patch_size).squeeze(0) for f in batch
        ]
        pixel_values = torch.stack(pixel_values)  # B, C, NP * P, NP * P

        with torch.no_grad():
            out = model(pixel_values=pixel_values)

        features = out.last_hidden_state[:, 1:]  # B, NP * NP, HIDDEN_DIM (CLS token dropped)
        features = features.cpu().numpy()
        features = features.reshape(batch_size * n_patches * n_patches, -1)  # B * NP * NP, HIDDEN_DIM

        pca_features = pca.fit_transform(features)
        pca_features = scaler.fit_transform(pca_features)

        if is_larger:
            pca_features_bg = pca_features[:, 0] > threshold
        else:
            pca_features_bg = pca_features[:, 0] < threshold

        pca_features_fg = ~pca_features_bg

        pca_features_fg_seg = pca.fit_transform(features[pca_features_fg])
        pca_features_fg_seg = scaler.fit_transform(pca_features_fg_seg)

        pca_features_rgb = np.zeros((batch_size * n_patches * n_patches, 3))
        pca_features_rgb[pca_features_bg] = 0
        pca_features_rgb[pca_features_fg] = pca_features_fg_seg
        pca_features_rgb = pca_features_rgb.reshape(batch_size, n_patches, n_patches, 3)

        if interpolate:
            # to a torch tensor of shape B, NP, NP, 3
            pca_features_rgb = torch.from_numpy(pca_features_rgb)
            # to B, 3, NP, NP for interpolation
            pca_features_rgb = pca_features_rgb.permute(0, 3, 1, 2)
            # upsample to B, 3, H, W, move channels last (B, H, W, 3),
            # then unbind into a list of B tensors of shape H, W, 3
            pca_features_rgb = F.interpolate(
                pca_features_rgb,
                size=(original_height, original_width),
                mode='bilinear',
                align_corners=False
            ).permute(0, 2, 3, 1).unbind(0)
        else:
            pca_features_rgb = [f for f in pca_features_rgb]
        # Adding to final_frames list
        final_frames.extend(pca_features_rgb)

    return final_frames
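The foreground/background logic above is easier to see in isolation. The following stand-alone sketch, illustrative and not part of the commit, reproduces it on random features in place of DINOv2 outputs: the first PCA component, min-max scaled and thresholded, marks the background patches, and a second PCA over the remaining foreground features supplies the RGB colouring.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

batch_size, n_patches, hidden_dim = 4, 30, 768
features = np.random.rand(batch_size * n_patches * n_patches, hidden_dim)

pca = PCA(n_components=3)
scaler = MinMaxScaler(clip=True)

# First PCA component, scaled to [0, 1] and thresholded, marks background patches.
pca_features = scaler.fit_transform(pca.fit_transform(features))
bg = pca_features[:, 0] > 0.5
fg = ~bg

# A second PCA, fit only on the foreground features, provides the RGB colouring.
rgb = np.zeros((features.shape[0], 3))
rgb[fg] = scaler.fit_transform(pca.fit_transform(features[fg]))
rgb = rgb.reshape(batch_size, n_patches, n_patches, 3)   # B, NP, NP, 3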
def create_video_from_frames_rgb(
    frame_list: List[np.ndarray],
    output_filename: str = "animation.mp4",
    fps: int = 15
) -> str:
    # Get the shape of the frames to determine video dimensions
    frame_height, frame_width, _ = frame_list[0].shape

    # Define the codec and create a VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # You can change the codec as needed
    out = cv2.VideoWriter(output_filename, fourcc, fps, (frame_width, frame_height))

    for frame in frame_list:
        # Convert the frame from RGB to BGR
        bgr_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

        # Write the frame to the video file
        out.write(bgr_frame)

    # Release the VideoWriter object
    out.release()

    return output_filename
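Illustrative usage of create_video_from_frames_rgb, not part of the commit: cv2.VideoWriter expects uint8 BGR frames, so float frames in [0, 1], like the ones process_video returns, would typically be scaled and cast before writing.

import numpy as np
from utils import create_video_from_frames_rgb

# Random float RGB frames stand in for the PCA visualisations.
float_frames = [np.random.rand(240, 320, 3) for _ in range(30)]   # H, W, C in [0, 1]
uint8_frames = [(f * 255).astype(np.uint8) for f in float_frames]  # VideoWriter needs uint8
print(create_video_from_frames_rgb(uint8_frames, output_filename="demo.mp4", fps=15))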