Upload folder using huggingface_hub
- Dockerfile +13 -0
- Utils/__init__.py +9 -0
- app.py +234 -0
- config.yaml +3 -0
- model.pth +3 -0
- requirements.txt +14 -0
Dockerfile
ADDED
@@ -0,0 +1,13 @@
FROM python:3.10

# OpenCV needs libGL at import time; install it as root before dropping
# privileges (the runtime apt-get in app.py cannot run as the non-root user).
RUN apt-get update && apt-get install -y libgl1

RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app
COPY --chown=user . $HOME/app

RUN pip install --no-cache-dir -r requirements.txt
EXPOSE 7860
ENV GRADIO_SERVER_NAME="0.0.0.0"

CMD ["python", "app.py"]
Utils/__init__.py
ADDED
@@ -0,0 +1,9 @@
import cv2

def get_video_properties(video_path):
    # Open the video and report its fps, frame size and an mp4 codec handle.
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    return cap, fps, width, height, fourcc
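For context, a minimal usage sketch of this helper (not part of the commit; the file names are hypothetical): the returned capture is read frame by frame, while the fps/size/fourcc values parameterize a cv2.VideoWriter.

from Utils import get_video_properties
import cv2

cap, fps, width, height, fourcc = get_video_properties("input.mp4")  # hypothetical input file
writer = cv2.VideoWriter("copy.mp4", fourcc, fps, (width, height))
while True:
    ret, frame = cap.read()
    if not ret:
        break
    writer.write(frame)  # a real pipeline would annotate the frame first
cap.release()
writer.release()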
app.py
ADDED
@@ -0,0 +1,234 @@
import subprocess
import time

def run_commands():
    # Each subprocess.run call gets its own shell, so a bare "cd" would not
    # persist to the next command; the weights are therefore fetched with an
    # explicit target directory instead.
    commands = [
        # apt-get needs root; in the Docker image libgl1 is already installed
        # at build time, so a failure here is logged and ignored.
        "apt-get update",
        "apt-get install -y libgl1",
        "git clone https://github.com/IDEA-Research/GroundingDINO.git",
        "pip install -e ./GroundingDINO",
        "mkdir -p GroundingDINO/weights",
        "wget -q -P GroundingDINO/weights https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth",
    ]

    for command in commands:
        try:
            print(f"Running command: {command}")
            result = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            print(result.stdout.decode())
        except subprocess.CalledProcessError as e:
            print(f"Command '{command}' failed with error: {e.stderr.decode()}")

# Run the setup commands before the GroundingDINO imports below, which need
# the cloned repository on disk.
if __name__ == "__main__":
    run_commands()

from typing import List
from Utils import get_video_properties
from GroundingDINO.groundingdino.util.inference import load_model, predict
import cv2
import numpy as np
import torch
from PIL import Image
import GroundingDINO.groundingdino.datasets.transforms as T
from torchvision.ops import box_convert
from torchvision import transforms
from torch import nn
from os.path import dirname, abspath
import yaml
import supervision as sv
import gradio as gr
import spaces

class DinoVisionTransformerClassifier(nn.Module):
    def __init__(self):
        super(DinoVisionTransformerClassifier, self).__init__()
        self.transformer = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14")
        self.classifier = nn.Sequential(nn.Linear(384, 256), nn.ReLU(), nn.Linear(256, 2))

    def forward(self, x):
        x = self.transformer(x)
        x = self.transformer.norm(x)
        x = self.classifier(x)
        return x


class ImageClassifier:

    def __init__(self):
        with open(f"{dirname(abspath(__file__))}/config.yaml", 'r') as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
            labels = config["labels"]

        self.labels = labels
        self.dino = DinoVisionTransformerClassifier()
        model_path = f"{dirname(abspath(__file__))}/model.pth"
        # map_location lets the checkpoint load on CPU-only machines as well
        state_dict = torch.load(model_path, map_location="cpu")
        self.dino.load_state_dict(state_dict)

    def preprocess(self, image: np.ndarray) -> torch.Tensor:
        data_transforms = {
            "test": transforms.Compose(
                [
                    transforms.Resize((224, 224)),
                    transforms.ToTensor(),
                    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
                ]
            )
        }
        image_pillow = Image.fromarray(image)
        img_transformed = data_transforms['test'](image_pillow)

        return img_transformed

    def predict(self, image):
        image = self.preprocess(image)
        image = image.unsqueeze(0)
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.dino.to(device)
        self.dino.eval()
        with torch.no_grad():
            output = self.dino(image.to(device))

        logit, predicted = torch.max(output.data, 1)
        return self.labels[predicted[0].item()], logit[0].item()


class VideoObjectDetection:

    def __init__(self, text_prompt: str):
        self.text_prompt = text_prompt

    def crop(self, frame, boxes):
        # Convert the first normalized cxcywh box to pixel xyxy and crop it out.
        h, w, _ = frame.shape
        boxes = boxes * torch.Tensor([w, h, w, h])
        xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
        min_col, min_row, max_col, max_row = map(int, xyxy[0])
        crop_image = frame[min_row:max_row, min_col:max_col, :]

        return crop_image

    def annotate(self,
                 image_source: np.ndarray,
                 boxes: torch.Tensor,
                 logits: torch.Tensor,
                 phrases: List[str],
                 frame_rgb: np.ndarray,
                 classifier) -> np.ndarray:

        h, w, _ = image_source.shape
        boxes = boxes * torch.Tensor([w, h, w, h])
        xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
        detections = sv.Detections(xyxy=xyxy)
        custom_labels = []
        custom_logits = []

        # Classify every detected face crop; low-confidence predictions are
        # labelled as unknown.
        for box in xyxy:
            min_col, min_row, max_col, max_row = map(int, box)
            crop_image = frame_rgb[min_row:max_row, min_col:max_col, :]
            label, logit = classifier.predict(crop_image)
            if logit >= 1:
                custom_labels.append(label)
                custom_logits.append(logit)
            else:
                custom_labels.append('unknown human face')
                custom_logits.append(logit)

        labels = [
            f"{phrase} {logit:.2f}"
            for phrase, logit
            in zip(custom_labels, custom_logits)
        ]

        box_annotator = sv.BoxAnnotator()
        annotated_frame = box_annotator.annotate(scene=image_source, detections=detections, labels=labels)
        return annotated_frame

    def preprocess_image(self, image: np.ndarray) -> torch.Tensor:
        transform = T.Compose(
            [
                T.RandomResize([800], max_size=1333),
                T.ToTensor(),
                T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
            ]
        )

        image_pillow = Image.fromarray(image)
        image_transformed, _ = transform(image_pillow, None)
        return image_transformed

    def generate_video(self, video_path):
        # Generator: yields one annotated RGB frame at a time so Gradio can
        # stream the result.

        # Load model, set up variables and get video properties
        cap, fps, width, height, fourcc = get_video_properties(video_path)
        model = load_model("GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
                           "GroundingDINO/weights/groundingdino_swint_ogc.pth")
        predictor = ImageClassifier()
        TEXT_PROMPT = self.text_prompt
        BOX_THRESHOLD = 0.6
        TEXT_THRESHOLD = 0.6

        # Read video frames, detect faces matching the text prompt, classify
        # each crop and yield the annotated frame
        frame_count = 0
        delay = 1 / fps  # Delay in seconds between frames
        while cap.isOpened():
            start_time = time.time()
            ret, frame = cap.read()
            if not ret:
                break

            if cv2.waitKey(1) & 0xff == ord('q'):  # no-op when running headless
                break

            # Convert the BGR frame to RGB and to a transformed torch tensor
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image_transformed = self.preprocess_image(frame_rgb)

            boxes, logits, phrases = predict(
                model=model,
                image=image_transformed,
                caption=TEXT_PROMPT,
                box_threshold=BOX_THRESHOLD,
                text_threshold=TEXT_THRESHOLD
            )

            # Annotate only when something was detected
            if boxes.size()[0] > 0:
                annotated_frame = self.annotate(image_source=frame, boxes=boxes, logits=logits,
                                                phrases=phrases, frame_rgb=frame_rgb, classifier=predictor)
                # cv2.imshow('Object detection', annotated_frame)
                frame_rgb = cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB)

                yield frame_rgb
            # Pace the loop to roughly the source frame rate
            elapsed_time = time.time() - start_time
            time_to_wait = max(delay - elapsed_time, 0)
            time.sleep(time_to_wait)

            frame_count += 1


@spaces.GPU(duration=200)
def video_object_classification_pipeline():
    video_annotator = VideoObjectDetection(text_prompt='human face')

    with gr.Blocks() as iface:
        video_input = gr.Video(label="Upload Video")
        run_button = gr.Button("Start Processing")
        output_image = gr.Image(label="Classified video")
        run_button.click(fn=video_annotator.generate_video, inputs=video_input,
                         outputs=output_image)

    iface.launch(share=False, debug=True)

print("All I'm missing now is the GPU")
video_object_classification_pipeline()
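Since generate_video is a generator of annotated RGB frames, Gradio streams each yielded frame into the gr.Image output. A minimal sketch of consuming the same generator outside the UI (e.g., appended to app.py for a local test; the GroundingDINO checkout and weights are assumed to be in place, and faces.mp4 is a hypothetical local file):

detector = VideoObjectDetection(text_prompt='human face')
cap, fps, width, height, fourcc = get_video_properties("faces.mp4")  # hypothetical input
cap.release()  # only needed for the writer parameters; generate_video opens its own capture

writer = cv2.VideoWriter("faces_annotated.mp4", fourcc, fps, (width, height))
for frame_rgb in detector.generate_video("faces.mp4"):
    writer.write(cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR))  # VideoWriter expects BGR
writer.release()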
config.yaml
ADDED
@@ -0,0 +1,3 @@
labels:
  - jason
  - lacy
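The two labels line up positionally with the two outputs of the classifier head (nn.Linear(256, 2) in DinoVisionTransformerClassifier); ImageClassifier.predict indexes this list with the argmax. A quick consistency check, sketched here rather than part of the commit:

import yaml

with open("config.yaml") as f:
    labels = yaml.safe_load(f)["labels"]

# The classifier head has 2 outputs, so exactly 2 labels are expected.
assert len(labels) == 2, f"expected 2 labels, got {len(labels)}"
print(labels)  # ['jason', 'lacy']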
model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ea6f40bb6ba86afa438e1a82c0de503995a2910b051339057dcf4f3075dd7bf1
size 88677278
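model.pth is stored as a Git LFS pointer; the actual ~88 MB state dict is materialized when the repo is cloned with LFS. A sanity-check sketch, assuming the checkpoint was saved via model.state_dict() as app.py's load_state_dict call implies (the key names are that assumption):

import torch

# Load on CPU; this requires the real file pulled via git-lfs, not the pointer text.
state_dict = torch.load("model.pth", map_location="cpu")

# app.py loads this directly into DinoVisionTransformerClassifier, so the
# head weights should be present with these shapes.
print(state_dict["classifier.0.weight"].shape)  # torch.Size([256, 384])
print(state_dict["classifier.2.weight"].shape)  # torch.Size([2, 256])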
requirements.txt
ADDED
@@ -0,0 +1,14 @@
--extra-index-url https://download.pytorch.org/whl/cu121
torch
torchvision
torchaudio

--extra-index-url https://pypi.org/simple
transformers
addict
yapf
timm
opencv-python
supervision  # BoxAnnotator.annotate(..., labels=...) was removed in newer supervision releases; an older version may be required
pycocotools
gradio