Hbvsa committed on
Commit
6d2982b
1 Parent(s): ea90ce6

Upload folder using huggingface_hub

Files changed (6)
  1. Dockerfile +13 -0
  2. Utils/__init__.py +9 -0
  3. app.py +234 -0
  4. config.yaml +3 -0
  5. model.pth +3 -0
  6. requirements.txt +14 -0
Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.10
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+ WORKDIR $HOME/app
+ COPY --chown=user . $HOME/app
+
+ RUN pip install --no-cache-dir -r requirements.txt
+ EXPOSE 7860
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
+
+ CMD ["python", "app.py"]
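Note: relying on the GRADIO_SERVER_NAME variable and the exposed port 7860 is equivalent to passing the values explicitly when launching Gradio. A minimal sketch for reference (the echo demo is only a placeholder, not part of this Space):

import gradio as gr

def echo(text: str) -> str:
    # Placeholder function so the sketch runs on its own.
    return text

demo = gr.Interface(fn=echo, inputs="text", outputs="text")
# Binding to 0.0.0.0 on port 7860 makes the app reachable from outside the container.
demo.launch(server_name="0.0.0.0", server_port=7860)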
Utils/__init__.py ADDED
@@ -0,0 +1,9 @@
+ import cv2
+
+ def get_video_properties(video_path):
+     cap = cv2.VideoCapture(video_path)
+     fps = int(cap.get(cv2.CAP_PROP_FPS))
+     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+     return cap, fps, width, height, fourcc
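The helper returns everything needed both to read the input video and to configure a matching writer. A usage sketch, with "input.mp4" and "output.mp4" as placeholder paths (app.py itself only streams frames and never writes a file):

from Utils import get_video_properties
import cv2

cap, fps, width, height, fourcc = get_video_properties("input.mp4")
writer = cv2.VideoWriter("output.mp4", fourcc, fps, (width, height))

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    writer.write(frame)  # cap.read() yields BGR frames, which VideoWriter expects

cap.release()
writer.release()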
app.py ADDED
@@ -0,0 +1,234 @@
+ import subprocess
+
+ def run_commands():
+     commands = [
+         "apt-get update",
+         "apt-get install -y libgl1",
+         "git clone https://github.com/IDEA-Research/GroundingDINO.git",
+         "pip install -e ./GroundingDINO",
+         # Each subprocess.run call spawns its own shell, so `cd` would not persist
+         # between commands; download the weights directly into GroundingDINO/weights.
+         "mkdir -p GroundingDINO/weights",
+         "wget -q -P GroundingDINO/weights https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth"
+     ]
+
+     for command in commands:
+         try:
+             print(f"Running command: {command}")
+             result = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+             print(result.stdout.decode())
+         except subprocess.CalledProcessError as e:
+             print(f"Command '{command}' failed with error: {e.stderr.decode()}")
+
+ # Run the setup commands before the GroundingDINO imports below
+
+ if __name__ == "__main__":
+     run_commands()
+
+ from typing import List
+ from Utils import get_video_properties
+ from GroundingDINO.groundingdino.util.inference import load_model, predict
+ import cv2
+ import numpy as np
+ import torch
+ from PIL import Image
+ import GroundingDINO.groundingdino.datasets.transforms as T
+ from torchvision.ops import box_convert
+ from torchvision import transforms
+ from torch import nn
+ from os.path import dirname, abspath
+ import yaml
+ import supervision as sv
+ import gradio as gr
+ import spaces
+
+ class DinoVisionTransformerClassifier(nn.Module):
+     def __init__(self):
+         super(DinoVisionTransformerClassifier, self).__init__()
+         self.transformer = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14")
+         self.classifier = nn.Sequential(nn.Linear(384, 256), nn.ReLU(), nn.Linear(256, 2))
+
+     def forward(self, x):
+         x = self.transformer(x)
+         x = self.transformer.norm(x)
+         x = self.classifier(x)
+         return x
+
+
+ class ImageClassifier:
+
+     def __init__(self):
+         with open(f"{dirname(abspath(__file__))}/config.yaml", 'r') as f:
+             config = yaml.load(f, Loader=yaml.FullLoader)
+             labels = config["labels"]
+
+         self.labels = labels
+         self.dino = DinoVisionTransformerClassifier()
+         model_path = f"{dirname(abspath(__file__))}/model.pth"
+         state_dict = torch.load(model_path)
+         self.dino.load_state_dict(state_dict)
+
+     def preprocess(self, image: np.ndarray) -> torch.Tensor:
+         data_transforms = {
+             "test": transforms.Compose(
+                 [
+                     transforms.Resize((224, 224)),
+                     transforms.ToTensor(),
+                     transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
+                 ]
+             )
+         }
+         image_pillow = Image.fromarray(image)
+         img_transformed = data_transforms['test'](image_pillow)
+
+         return img_transformed
+
+     def predict(self, image):
+         image = self.preprocess(image)
+         image = image.unsqueeze(0)
+         device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+         self.dino.to(device)
+         self.dino.eval()
+         with torch.no_grad():
+             output = self.dino(image.to(device))
+
+         logit, predicted = torch.max(output.data, 1)
+         return self.labels[predicted[0].item()], logit[0].item()
+
+
+ class VideoObjectDetection:
+
+     def __init__(self,
+                  text_prompt: str
+                  ):
+
+         self.text_prompt = text_prompt
+
+     def crop(self, frame, boxes):
+
+         h, w, _ = frame.shape
+         boxes = boxes * torch.Tensor([w, h, w, h])
+         xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
+         min_col, min_row, max_col, max_row = map(int, xyxy[0])
+         crop_image = frame[min_row:max_row, min_col:max_col, :]
+
+         return crop_image
+
+     def annotate(self,
+                  image_source: np.ndarray,
+                  boxes: torch.Tensor,
+                  logits: torch.Tensor,
+                  phrases: List[str],
+                  frame_rgb: np.ndarray,
+                  classifier) -> np.ndarray:
+
+         h, w, _ = image_source.shape
+         boxes = boxes * torch.Tensor([w, h, w, h])
+         xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
+         detections = sv.Detections(xyxy=xyxy)
+         # Classify every detected crop with the DINOv2 classifier
+         custom_labels = []
+         custom_logits = []
+
+         for box in xyxy:
+             min_col, min_row, max_col, max_row = map(int, box)
+             crop_image = frame_rgb[min_row:max_row, min_col:max_col, :]
+             label, logit = classifier.predict(crop_image)
+             # Low-confidence predictions are labelled as unknown
+             if logit >= 1:
+                 custom_labels.append(label)
+                 custom_logits.append(logit)
+             else:
+                 custom_labels.append('unknown human face')
+                 custom_logits.append(logit)
+
+         labels = [
+             f"{phrase} {logit:.2f}"
+             for phrase, logit
+             in zip(custom_labels, custom_logits)
+         ]
+
+         box_annotator = sv.BoxAnnotator()
+         annotated_frame = box_annotator.annotate(scene=image_source, detections=detections, labels=labels)
+         return annotated_frame
+
+     def preprocess_image(self, image: np.ndarray) -> torch.Tensor:
+         transform = T.Compose(
+             [
+                 T.RandomResize([800], max_size=1333),
+                 T.ToTensor(),
+                 T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+             ]
+         )
+
+         image_pillow = Image.fromarray(image)
+         image_transformed, _ = transform(image_pillow, None)
+         return image_transformed
+
+     def generate_video(self, video_path):
+
+         # Load models and get the input video properties
+         cap, fps, width, height, fourcc = get_video_properties(video_path)
+         model = load_model("GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
+                            "GroundingDINO/weights/groundingdino_swint_ogc.pth")
+         predictor = ImageClassifier()
+         TEXT_PROMPT = self.text_prompt
+         BOX_THRESHOLD = 0.6
+         TEXT_THRESHOLD = 0.6
+
+         # Read frames, detect objects matching the text prompt and classify each crop
+         import time
+         frame_count = 0
+         delay = 1 / fps  # Delay in seconds between frames
+         while cap.isOpened():
+             start_time = time.time()
+             ret, frame = cap.read()
+             if not ret:
+                 break
+
+             if cv2.waitKey(1) & 0xff == ord('q'):
+                 break
+
+             # Convert the BGR frame to RGB and preprocess it into a torch tensor
+             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+             image_transformed = self.preprocess_image(frame_rgb)
+
+             boxes, logits, phrases = predict(
+                 model=model,
+                 image=image_transformed,
+                 caption=TEXT_PROMPT,
+                 box_threshold=BOX_THRESHOLD,
+                 text_threshold=TEXT_THRESHOLD
+             )
+
+             # Annotate the frame only when at least one box was detected
+             if boxes.size()[0] > 0:
+                 annotated_frame = self.annotate(image_source=frame, boxes=boxes, logits=logits,
+                                                 phrases=phrases, frame_rgb=frame_rgb, classifier=predictor)
+                 # cv2.imshow('Object detection', annotated_frame)
+                 frame_rgb = cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB)
+
+             yield frame_rgb
+             elapsed_time = time.time() - start_time
+             time_to_wait = max(delay - elapsed_time, 0)
+             time.sleep(time_to_wait)
+
+             frame_count += 1
+
+
+ @spaces.GPU(duration=200)
+ def video_object_classification_pipeline():
+     video_annotator = VideoObjectDetection(
+         text_prompt='human face')
+
+     with gr.Blocks() as iface:
+         video_input = gr.Video(label="Upload Video")
+         run_button = gr.Button("Start Processing")
+         output_image = gr.Image(label="Classified video")
+         run_button.click(fn=video_annotator.generate_video, inputs=video_input,
+                          outputs=output_image)
+
+     iface.launch(share=False, debug=True)
+
+ print("Só me falta a GPU")  # "All I'm missing is the GPU"
+ video_object_classification_pipeline()
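Since generate_video is a generator of RGB numpy frames, it can be smoke-tested without the Gradio UI. A hedged sketch, assuming the GroundingDINO weights are already in place, "sample.mp4" is a placeholder path, and the module-level video_object_classification_pipeline() call has been guarded so importing app does not launch the interface:

from app import VideoObjectDetection

annotator = VideoObjectDetection(text_prompt="human face")
for i, frame_rgb in enumerate(annotator.generate_video("sample.mp4")):
    print(f"frame {i}: shape={frame_rgb.shape}")
    if i == 10:  # a few frames are enough for a smoke test
        break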
config.yaml ADDED
@@ -0,0 +1,3 @@
+ labels:
+   - jason
+   - lacy
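The order of the labels list must match the output indices of the two-class head in app.py (self.labels[predicted[0].item()]), so index 0 maps to jason and index 1 to lacy. A small sketch of that lookup (the predicted index is hypothetical):

import yaml

with open("config.yaml") as f:
    labels = yaml.safe_load(f)["labels"]

predicted_index = 1  # hypothetical argmax from the classifier head
print(labels[predicted_index])  # -> "lacy"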
model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea6f40bb6ba86afa438e1a82c0de503995a2910b051339057dcf4f3075dd7bf1
+ size 88677278
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ --extra-index-url https://download.pytorch.org/whl/cu121
+ torch
+ torchvision
+ torchaudio
+
+ --extra-index-url https://pypi.org/simple
+ transformers
+ addict
+ yapf
+ timm
+ opencv-python
+ supervision
+ pycocotools
+ gradio